[PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode

fangyu.yu@linux.alibaba.com posted 1 patch 1 month ago
arch/riscv/include/asm/kvm_gstage.h | 12 ++---
arch/riscv/include/asm/kvm_host.h   |  4 ++
arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
arch/riscv/kvm/main.c               |  4 +-
arch/riscv/kvm/mmu.c                | 18 +++++--
arch/riscv/kvm/vm.c                 |  2 +-
arch/riscv/kvm/vmid.c               |  2 +-
7 files changed, 74 insertions(+), 50 deletions(-)
[PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
Posted by fangyu.yu@linux.alibaba.com 1 month ago
From: Fangyu Yu <fangyu.yu@linux.alibaba.com>

Introduces two per-VM architecture-specific fields to support runtime
configuration of the G-stage page table format:

- kvm->arch.kvm_riscv_gstage_mode: specifies the HGATP mode used by the
  current VM;
- kvm->arch.kvm_riscv_gstage_pgd_levels: the corresponding number of page
  table levels for the selected mode.

These fields replace the previous global variables
kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
virtual machines to independently select their G-stage page table format
instead of being forced to share the maximum mode detected by the kernel
at boot time.

Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
---
 arch/riscv/include/asm/kvm_gstage.h | 12 ++---
 arch/riscv/include/asm/kvm_host.h   |  4 ++
 arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
 arch/riscv/kvm/main.c               |  4 +-
 arch/riscv/kvm/mmu.c                | 18 +++++--
 arch/riscv/kvm/vm.c                 |  2 +-
 arch/riscv/kvm/vmid.c               |  2 +-
 7 files changed, 74 insertions(+), 50 deletions(-)

diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
index 595e2183173e..fdcada123b3f 100644
--- a/arch/riscv/include/asm/kvm_gstage.h
+++ b/arch/riscv/include/asm/kvm_gstage.h
@@ -29,16 +29,11 @@ struct kvm_gstage_mapping {
 #define kvm_riscv_gstage_index_bits	10
 #endif
 
-extern unsigned long kvm_riscv_gstage_mode;
-extern unsigned long kvm_riscv_gstage_pgd_levels;
+extern unsigned long kvm_riscv_gstage_max_mode;
+extern unsigned long kvm_riscv_gstage_max_pgd_levels;
 
 #define kvm_riscv_gstage_pgd_xbits	2
 #define kvm_riscv_gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
-#define kvm_riscv_gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
-					 (kvm_riscv_gstage_pgd_levels * \
-					  kvm_riscv_gstage_index_bits) + \
-					 kvm_riscv_gstage_pgd_xbits)
-#define kvm_riscv_gstage_gpa_size	((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
 
 bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
 			       pte_t **ptepp, u32 *ptep_level);
@@ -69,4 +64,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
 
 void kvm_riscv_gstage_mode_detect(void);
 
+gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k);
+unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k);
+
 #endif
diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
index 24585304c02b..27ea8e8fd5b0 100644
--- a/arch/riscv/include/asm/kvm_host.h
+++ b/arch/riscv/include/asm/kvm_host.h
@@ -103,6 +103,10 @@ struct kvm_arch {
 
 	/* KVM_CAP_RISCV_MP_STATE_RESET */
 	bool mp_state_reset;
+
+	unsigned long kvm_riscv_gstage_mode;
+	unsigned long kvm_riscv_gstage_pgd_levels;
+	bool gstage_mode_initialized;
 };
 
 struct kvm_cpu_trap {
diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
index b67d60d722c2..06452e4c2ab2 100644
--- a/arch/riscv/kvm/gstage.c
+++ b/arch/riscv/kvm/gstage.c
@@ -12,22 +12,23 @@
 #include <asm/kvm_gstage.h>
 
 #ifdef CONFIG_64BIT
-unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
-unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
+unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV39X4;
+unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
 #else
-unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
-unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
+unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV32X4;
+unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
 #endif
 
 #define gstage_pte_leaf(__ptep)	\
 	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
 
-static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
+static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
+					     gpa_t addr, u32 level)
 {
 	unsigned long mask;
 	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
 
-	if (level == (kvm_riscv_gstage_pgd_levels - 1))
+	if (level == (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1))
 		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
 	else
 		mask = PTRS_PER_PTE - 1;
@@ -40,12 +41,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
 	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
 }
 
-static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
+static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
+				     u32 *out_level)
 {
 	u32 i;
 	unsigned long psz = 1UL << 12;
 
-	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
+	for (i = 0; i < gstage->kvm->arch.kvm_riscv_gstage_pgd_levels; i++) {
 		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
 			*out_level = i;
 			return 0;
@@ -55,21 +57,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
 	return -EINVAL;
 }
 
-static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
+static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
+				      unsigned long *out_pgorder)
 {
-	if (kvm_riscv_gstage_pgd_levels < level)
+	if (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels < level)
 		return -EINVAL;
 
 	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
 	return 0;
 }
 
-static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
+static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
+				     unsigned long *out_pgsize)
 {
 	int rc;
 	unsigned long page_order = PAGE_SHIFT;
 
-	rc = gstage_level_to_page_order(level, &page_order);
+	rc = gstage_level_to_page_order(gstage, level, &page_order);
 	if (rc)
 		return rc;
 
@@ -81,11 +85,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
 			       pte_t **ptepp, u32 *ptep_level)
 {
 	pte_t *ptep;
-	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
 
 	*ptep_level = current_level;
 	ptep = (pte_t *)gstage->pgd;
-	ptep = &ptep[gstage_pte_index(addr, current_level)];
+	ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
 	while (ptep && pte_val(ptep_get(ptep))) {
 		if (gstage_pte_leaf(ptep)) {
 			*ptep_level = current_level;
@@ -97,7 +101,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
 			current_level--;
 			*ptep_level = current_level;
 			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
-			ptep = &ptep[gstage_pte_index(addr, current_level)];
+			ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
 		} else {
 			ptep = NULL;
 		}
@@ -110,7 +114,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
 {
 	unsigned long order = PAGE_SHIFT;
 
-	if (gstage_level_to_page_order(level, &order))
+	if (gstage_level_to_page_order(gstage, level, &order))
 		return;
 	addr &= ~(BIT(order) - 1);
 
@@ -125,9 +129,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
 			     struct kvm_mmu_memory_cache *pcache,
 			     const struct kvm_gstage_mapping *map)
 {
-	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
+	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
 	pte_t *next_ptep = (pte_t *)gstage->pgd;
-	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+	pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
 
 	if (current_level < map->level)
 		return -EINVAL;
@@ -151,7 +155,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
 		}
 
 		current_level--;
-		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
+		ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
 	}
 
 	if (pte_val(*ptep) != pte_val(map->pte)) {
@@ -175,7 +179,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
 	out_map->addr = gpa;
 	out_map->level = 0;
 
-	ret = gstage_page_size_to_level(page_size, &out_map->level);
+	ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
 	if (ret)
 		return ret;
 
@@ -217,7 +221,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 	u32 next_ptep_level;
 	unsigned long next_page_size, page_size;
 
-	ret = gstage_level_to_page_size(ptep_level, &page_size);
+	ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
 	if (ret)
 		return;
 
@@ -229,7 +233,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
 	if (ptep_level && !gstage_pte_leaf(ptep)) {
 		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
 		next_ptep_level = ptep_level - 1;
-		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
+		ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
 		if (ret)
 			return;
 
@@ -263,7 +267,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
 
 	while (addr < end) {
 		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
-		ret = gstage_level_to_page_size(ptep_level, &page_size);
+		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
 		if (ret)
 			break;
 
@@ -297,7 +301,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
 
 	while (addr < end) {
 		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
-		ret = gstage_level_to_page_size(ptep_level, &page_size);
+		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
 		if (ret)
 			break;
 
@@ -319,41 +323,51 @@ void __init kvm_riscv_gstage_mode_detect(void)
 	/* Try Sv57x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
-		kvm_riscv_gstage_pgd_levels = 5;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV57X4;
+		kvm_riscv_gstage_max_pgd_levels = 5;
 		goto done;
 	}
 
 	/* Try Sv48x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
-		kvm_riscv_gstage_pgd_levels = 4;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV48X4;
+		kvm_riscv_gstage_max_pgd_levels = 4;
 		goto done;
 	}
 
 	/* Try Sv39x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
-		kvm_riscv_gstage_pgd_levels = 3;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV39X4;
+		kvm_riscv_gstage_max_pgd_levels = 3;
 		goto done;
 	}
 #else /* CONFIG_32BIT */
 	/* Try Sv32x4 G-stage mode */
 	csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
 	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
-		kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
-		kvm_riscv_gstage_pgd_levels = 2;
+		kvm_riscv_gstage_max_mode = HGATP_MODE_SV32X4;
+		kvm_riscv_gstage_max_pgd_levels = 2;
 		goto done;
 	}
 #endif
 
 	/* KVM depends on !HGATP_MODE_OFF */
-	kvm_riscv_gstage_mode = HGATP_MODE_OFF;
-	kvm_riscv_gstage_pgd_levels = 0;
+	kvm_riscv_gstage_max_mode = HGATP_MODE_OFF;
+	kvm_riscv_gstage_max_pgd_levels = 0;
 
 done:
 	csr_write(CSR_HGATP, 0);
 	kvm_riscv_local_hfence_gvma_all();
 }
+
+unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k) {
+	return (HGATP_PAGE_SHIFT + (k->kvm_riscv_gstage_pgd_levels *
+		    kvm_riscv_gstage_index_bits) +
+		    kvm_riscv_gstage_pgd_xbits);
+}
+
+gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k) {
+	return ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits(k)));
+}
diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
index 45536af521f0..56a246e0e791 100644
--- a/arch/riscv/kvm/main.c
+++ b/arch/riscv/kvm/main.c
@@ -105,7 +105,7 @@ static int __init riscv_kvm_init(void)
 		return rc;
 
 	kvm_riscv_gstage_mode_detect();
-	switch (kvm_riscv_gstage_mode) {
+	switch (kvm_riscv_gstage_max_mode) {
 	case HGATP_MODE_SV32X4:
 		str = "Sv32x4";
 		break;
@@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
 			 (rc) ? slist : "no features");
 	}
 
-	kvm_info("using %s G-stage page table format\n", str);
+	kvm_info("Max G-stage page table format %s \n", str);
 
 	kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
 
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 4ab06697bfc0..574783907162 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
 		if (!writable)
 			map.pte = pte_wrprotect(map.pte);
 
-		ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
+		ret = kvm_mmu_topup_memory_cache(&pcache,kvm->arch.kvm_riscv_gstage_pgd_levels);
 		if (ret)
 			goto out;
 
@@ -186,8 +186,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 	 * space addressable by the KVM guest GPA space.
 	 */
 	if ((new->base_gfn + new->npages) >=
-	    (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
+			(kvm_riscv_gstage_gpa_size(&kvm->arch) >> PAGE_SHIFT)) {
 		return -EFAULT;
+	}
 
 	hva = new->userspace_addr;
 	size = new->npages << PAGE_SHIFT;
@@ -332,7 +333,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 	memset(out_map, 0, sizeof(*out_map));
 
 	/* We need minimum second+third level pages */
-	ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
+	ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.kvm_riscv_gstage_pgd_levels);
 	if (ret) {
 		kvm_err("Failed to topup G-stage cache\n");
 		return ret;
@@ -431,6 +432,11 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.pgd = page_to_virt(pgd_page);
 	kvm->arch.pgd_phys = page_to_phys(pgd_page);
+	if (!kvm->arch.gstage_mode_initialized) {
+		/*user-space didn't set KVM_CAP_RISC_HGATP_MODE cap*/
+		kvm->arch.kvm_riscv_gstage_mode = kvm_riscv_gstage_max_mode;
+		kvm->arch.kvm_riscv_gstage_pgd_levels = kvm_riscv_gstage_max_pgd_levels;
+	}
 
 	return 0;
 }
@@ -446,10 +452,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
 		gstage.flags = 0;
 		gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
 		gstage.pgd = kvm->arch.pgd;
-		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
+		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(&kvm->arch), false);
 		pgd = READ_ONCE(kvm->arch.pgd);
 		kvm->arch.pgd = NULL;
 		kvm->arch.pgd_phys = 0;
+		kvm->arch.kvm_riscv_gstage_mode = HGATP_MODE_OFF;
+		kvm->arch.kvm_riscv_gstage_pgd_levels = 0;
 	}
 	spin_unlock(&kvm->mmu_lock);
 
@@ -459,8 +467,8 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
 
 void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
 {
-	unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
 	struct kvm_arch *k = &vcpu->kvm->arch;
+	unsigned long hgatp = k->kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
 
 	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
 	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
index 66d91ae6e9b2..4b2156df40fc 100644
--- a/arch/riscv/kvm/vm.c
+++ b/arch/riscv/kvm/vm.c
@@ -200,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		r = KVM_USER_MEM_SLOTS;
 		break;
 	case KVM_CAP_VM_GPA_BITS:
-		r = kvm_riscv_gstage_gpa_bits;
+		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
 		break;
 	default:
 		r = 0;
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index cf34d448289d..db27430f111e 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(vmid_lock);
 void __init kvm_riscv_gstage_vmid_detect(void)
 {
 	/* Figure-out number of VMID bits in HW */
-	csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
+	csr_write(CSR_HGATP, (kvm_riscv_gstage_max_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
 	vmid_bits = csr_read(CSR_HGATP);
 	vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
 	vmid_bits = fls_long(vmid_bits);
-- 
2.50.1
Re: [PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
Posted by Andrew Jones 3 weeks, 1 day ago
On Mon, Jan 05, 2026 at 10:32:31PM +0800, fangyu.yu@linux.alibaba.com wrote:
> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> 
> Introduces two per-VM architecture-specific fields to support runtime
> configuration of the G-stage page table format:
> 
> - kvm->arch.kvm_riscv_gstage_mode: specifies the HGATP mode used by the
>   current VM;
> - kvm->arch.kvm_riscv_gstage_pgd_levels: the corresponding number of page
>   table levels for the selected mode.
> 
> These fields replace the previous global variables
> kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
> virtual machines to independently select their G-stage page table format
> instead of being forced to share the maximum mode detected by the kernel
> at boot time.
> 
> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
> ---
>  arch/riscv/include/asm/kvm_gstage.h | 12 ++---
>  arch/riscv/include/asm/kvm_host.h   |  4 ++
>  arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
>  arch/riscv/kvm/main.c               |  4 +-
>  arch/riscv/kvm/mmu.c                | 18 +++++--
>  arch/riscv/kvm/vm.c                 |  2 +-
>  arch/riscv/kvm/vmid.c               |  2 +-
>  7 files changed, 74 insertions(+), 50 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..fdcada123b3f 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -29,16 +29,11 @@ struct kvm_gstage_mapping {
>  #define kvm_riscv_gstage_index_bits	10
>  #endif
>  
> -extern unsigned long kvm_riscv_gstage_mode;
> -extern unsigned long kvm_riscv_gstage_pgd_levels;
> +extern unsigned long kvm_riscv_gstage_max_mode;
> +extern unsigned long kvm_riscv_gstage_max_pgd_levels;
>  
>  #define kvm_riscv_gstage_pgd_xbits	2
>  #define kvm_riscv_gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
> -#define kvm_riscv_gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
> -					 (kvm_riscv_gstage_pgd_levels * \
> -					  kvm_riscv_gstage_index_bits) + \
> -					 kvm_riscv_gstage_pgd_xbits)
> -#define kvm_riscv_gstage_gpa_size	((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
>  
>  bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>  			       pte_t **ptepp, u32 *ptep_level);
> @@ -69,4 +64,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>  
>  void kvm_riscv_gstage_mode_detect(void);
>  
> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k);
> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k);
> +
>  #endif
> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
> index 24585304c02b..27ea8e8fd5b0 100644
> --- a/arch/riscv/include/asm/kvm_host.h
> +++ b/arch/riscv/include/asm/kvm_host.h
> @@ -103,6 +103,10 @@ struct kvm_arch {
>  
>  	/* KVM_CAP_RISCV_MP_STATE_RESET */
>  	bool mp_state_reset;
> +
> +	unsigned long kvm_riscv_gstage_mode;

There's a 1:1 mapping between mode and levels, so we don't need to track
both. Since mode is rarely used, I think something like this would still
provide enough convenience without requiring the storage allocation.

 /*
  * Derive the HGATP mode of a VM from its number of G-stage page table
  * levels, so that only the level count has to be stored per VM.
  *
  * The table is indexed by kvm->arch.kvm_riscv_gstage_pgd_levels; indices
  * that are not listed are zero-initialized. The index is not range-checked
  * here, so the stored level count must not exceed 5.
  */
 static inline unsigned long kvm_riscv_gstage_mode(struct kvm_gstage *gstage)
 {
     /* static const: build the table once instead of on every call */
     static const unsigned long modes[] = {
         [2] = HGATP_MODE_SV32X4,
         [3] = HGATP_MODE_SV39X4,
         [4] = HGATP_MODE_SV48X4,
         [5] = HGATP_MODE_SV57X4,
     };

     return modes[gstage->kvm->arch.kvm_riscv_gstage_pgd_levels];
 }

> +	unsigned long kvm_riscv_gstage_pgd_levels;
> +	bool gstage_mode_initialized;
>  };
>  
>  struct kvm_cpu_trap {
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d722c2..06452e4c2ab2 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -12,22 +12,23 @@
>  #include <asm/kvm_gstage.h>
>  
>  #ifdef CONFIG_64BIT
> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV39X4;

With a kvm_riscv_gstage_mode() function we don't need
kvm_riscv_gstage_max_mode either.

> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
>  #else
> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV32X4;
> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
>  #endif
>  
>  #define gstage_pte_leaf(__ptep)	\
>  	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
>  
> -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
> +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
> +					     gpa_t addr, u32 level)
>  {
>  	unsigned long mask;
>  	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
>  
> -	if (level == (kvm_riscv_gstage_pgd_levels - 1))
> +	if (level == (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1))

nit: we can drop the unnecessary () while touching this line.

>  		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
>  	else
>  		mask = PTRS_PER_PTE - 1;
> @@ -40,12 +41,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
>  	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
>  }
>  
> -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
> +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
> +				     u32 *out_level)
>  {
>  	u32 i;
>  	unsigned long psz = 1UL << 12;
>  
> -	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
> +	for (i = 0; i < gstage->kvm->arch.kvm_riscv_gstage_pgd_levels; i++) {
>  		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
>  			*out_level = i;
>  			return 0;
> @@ -55,21 +57,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
>  	return -EINVAL;
>  }
>  
> -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
> +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
> +				      unsigned long *out_pgorder)
>  {
> -	if (kvm_riscv_gstage_pgd_levels < level)
> +	if (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels < level)
>  		return -EINVAL;
>  
>  	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
>  	return 0;
>  }
>  
> -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
> +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
> +				     unsigned long *out_pgsize)
>  {
>  	int rc;
>  	unsigned long page_order = PAGE_SHIFT;
>  
> -	rc = gstage_level_to_page_order(level, &page_order);
> +	rc = gstage_level_to_page_order(gstage, level, &page_order);
>  	if (rc)
>  		return rc;
>  
> @@ -81,11 +85,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>  			       pte_t **ptepp, u32 *ptep_level)
>  {
>  	pte_t *ptep;
> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>  
>  	*ptep_level = current_level;
>  	ptep = (pte_t *)gstage->pgd;
> -	ptep = &ptep[gstage_pte_index(addr, current_level)];
> +	ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>  	while (ptep && pte_val(ptep_get(ptep))) {
>  		if (gstage_pte_leaf(ptep)) {
>  			*ptep_level = current_level;
> @@ -97,7 +101,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>  			current_level--;
>  			*ptep_level = current_level;
>  			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> -			ptep = &ptep[gstage_pte_index(addr, current_level)];
> +			ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>  		} else {
>  			ptep = NULL;
>  		}
> @@ -110,7 +114,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
>  {
>  	unsigned long order = PAGE_SHIFT;
>  
> -	if (gstage_level_to_page_order(level, &order))
> +	if (gstage_level_to_page_order(gstage, level, &order))
>  		return;
>  	addr &= ~(BIT(order) - 1);
>  
> @@ -125,9 +129,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>  			     struct kvm_mmu_memory_cache *pcache,
>  			     const struct kvm_gstage_mapping *map)
>  {
> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>  	pte_t *next_ptep = (pte_t *)gstage->pgd;
> -	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
> +	pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>  
>  	if (current_level < map->level)
>  		return -EINVAL;
> @@ -151,7 +155,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>  		}
>  
>  		current_level--;
> -		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
> +		ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>  	}
>  
>  	if (pte_val(*ptep) != pte_val(map->pte)) {
> @@ -175,7 +179,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>  	out_map->addr = gpa;
>  	out_map->level = 0;
>  
> -	ret = gstage_page_size_to_level(page_size, &out_map->level);
> +	ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
>  	if (ret)
>  		return ret;
>  
> @@ -217,7 +221,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>  	u32 next_ptep_level;
>  	unsigned long next_page_size, page_size;
>  
> -	ret = gstage_level_to_page_size(ptep_level, &page_size);
> +	ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>  	if (ret)
>  		return;
>  
> @@ -229,7 +233,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>  	if (ptep_level && !gstage_pte_leaf(ptep)) {
>  		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>  		next_ptep_level = ptep_level - 1;
> -		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
> +		ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
>  		if (ret)
>  			return;
>  
> @@ -263,7 +267,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>  
>  	while (addr < end) {
>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>  		if (ret)
>  			break;
>  
> @@ -297,7 +301,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>  
>  	while (addr < end) {
>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>  		if (ret)
>  			break;
>  
> @@ -319,41 +323,51 @@ void __init kvm_riscv_gstage_mode_detect(void)
>  	/* Try Sv57x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
> -		kvm_riscv_gstage_pgd_levels = 5;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV57X4;
> +		kvm_riscv_gstage_max_pgd_levels = 5;
>  		goto done;
>  	}
>  
>  	/* Try Sv48x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
> -		kvm_riscv_gstage_pgd_levels = 4;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV48X4;
> +		kvm_riscv_gstage_max_pgd_levels = 4;
>  		goto done;
>  	}
>  
>  	/* Try Sv39x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
> -		kvm_riscv_gstage_pgd_levels = 3;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV39X4;
> +		kvm_riscv_gstage_max_pgd_levels = 3;
>  		goto done;
>  	}
>  #else /* CONFIG_32BIT */
>  	/* Try Sv32x4 G-stage mode */
>  	csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
> -		kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
> -		kvm_riscv_gstage_pgd_levels = 2;
> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV32X4;
> +		kvm_riscv_gstage_max_pgd_levels = 2;
>  		goto done;
>  	}
>  #endif
>  
>  	/* KVM depends on !HGATP_MODE_OFF */
> -	kvm_riscv_gstage_mode = HGATP_MODE_OFF;
> -	kvm_riscv_gstage_pgd_levels = 0;
> +	kvm_riscv_gstage_max_mode = HGATP_MODE_OFF;
> +	kvm_riscv_gstage_max_pgd_levels = 0;
>  
>  done:
>  	csr_write(CSR_HGATP, 0);
>  	kvm_riscv_local_hfence_gvma_all();
>  }
> +
> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k) {

Did you run checkpatch? I think it requires '{' to be on its own line.

nit: s/k/ka/ would be consistent with other archs. I see k is also
used in riscv's kvm_riscv_mmu_update_hgatp(), but that can be fixed up
in this patch, since there's a change in the same place too.


> +	return (HGATP_PAGE_SHIFT + (k->kvm_riscv_gstage_pgd_levels *
> +		    kvm_riscv_gstage_index_bits) +
> +		    kvm_riscv_gstage_pgd_xbits);
> +}
> +
> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k) {

same comments as above

> +	return ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits(k)));

 return BIT_ULL(kvm_riscv_gstage_gpa_bits(ka));

(the cast is implicit from return type)

> +}
> diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
> index 45536af521f0..56a246e0e791 100644
> --- a/arch/riscv/kvm/main.c
> +++ b/arch/riscv/kvm/main.c
> @@ -105,7 +105,7 @@ static int __init riscv_kvm_init(void)
>  		return rc;
>  
>  	kvm_riscv_gstage_mode_detect();
> -	switch (kvm_riscv_gstage_mode) {
> +	switch (kvm_riscv_gstage_max_mode) {
>  	case HGATP_MODE_SV32X4:
>  		str = "Sv32x4";
>  		break;
> @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
>  			 (rc) ? slist : "no features");
>  	}
>  
> -	kvm_info("using %s G-stage page table format\n", str);
> +	kvm_info("Max G-stage page table format %s \n", str);
>  
>  	kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
>  
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 4ab06697bfc0..574783907162 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>  		if (!writable)
>  			map.pte = pte_wrprotect(map.pte);
>  
> -		ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
> +		ret = kvm_mmu_topup_memory_cache(&pcache,kvm->arch.kvm_riscv_gstage_pgd_levels);
                                                         ^ missing space

>  		if (ret)
>  			goto out;
>  
> @@ -186,8 +186,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>  	 * space addressable by the KVM guest GPA space.
>  	 */
>  	if ((new->base_gfn + new->npages) >=
> -	    (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
> +			(kvm_riscv_gstage_gpa_size(&kvm->arch) >> PAGE_SHIFT)) {
>  		return -EFAULT;
> +	}

nit: Remove the unnecessary () and the braces, and the condition will fit
on one 100-char line.

>  
>  	hva = new->userspace_addr;
>  	size = new->npages << PAGE_SHIFT;
> @@ -332,7 +333,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>  	memset(out_map, 0, sizeof(*out_map));
>  
>  	/* We need minimum second+third level pages */
> -	ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
> +	ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.kvm_riscv_gstage_pgd_levels);
>  	if (ret) {
>  		kvm_err("Failed to topup G-stage cache\n");
>  		return ret;
> @@ -431,6 +432,11 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
>  		return -ENOMEM;
>  	kvm->arch.pgd = page_to_virt(pgd_page);
>  	kvm->arch.pgd_phys = page_to_phys(pgd_page);
> +	if (!kvm->arch.gstage_mode_initialized) {
> +		/*user-space didn't set KVM_CAP_RISC_HGATP_MODE cap*/
                  ^ missing space                                  ^ missing space
> +		kvm->arch.kvm_riscv_gstage_mode = kvm_riscv_gstage_max_mode;
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = kvm_riscv_gstage_max_pgd_levels;

Missing 'kvm->arch.gstage_mode_initialized = true' statement.

> +	}
>  
>  	return 0;
>  }
> @@ -446,10 +452,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>  		gstage.flags = 0;
>  		gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>  		gstage.pgd = kvm->arch.pgd;
> -		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
> +		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(&kvm->arch), false);
>  		pgd = READ_ONCE(kvm->arch.pgd);
>  		kvm->arch.pgd = NULL;
>  		kvm->arch.pgd_phys = 0;
> +		kvm->arch.kvm_riscv_gstage_mode = HGATP_MODE_OFF;
> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 0;
>  	}
>  	spin_unlock(&kvm->mmu_lock);
>  
> @@ -459,8 +467,8 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>  
>  void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
>  {
> -	unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>  	struct kvm_arch *k = &vcpu->kvm->arch;
> +	unsigned long hgatp = k->kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>  
>  	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
>  	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
> index 66d91ae6e9b2..4b2156df40fc 100644
> --- a/arch/riscv/kvm/vm.c
> +++ b/arch/riscv/kvm/vm.c
> @@ -200,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>  		r = KVM_USER_MEM_SLOTS;
>  		break;
>  	case KVM_CAP_VM_GPA_BITS:
> -		r = kvm_riscv_gstage_gpa_bits;
> +		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
>  		break;
>  	default:
>  		r = 0;
> diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
> index cf34d448289d..db27430f111e 100644
> --- a/arch/riscv/kvm/vmid.c
> +++ b/arch/riscv/kvm/vmid.c
> @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(vmid_lock);
>  void __init kvm_riscv_gstage_vmid_detect(void)
>  {
>  	/* Figure-out number of VMID bits in HW */
> -	csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
> +	csr_write(CSR_HGATP, (kvm_riscv_gstage_max_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
>  	vmid_bits = csr_read(CSR_HGATP);
>  	vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
>  	vmid_bits = fls_long(vmid_bits);
> -- 
> 2.50.1
>

Thanks,
drew
Re: Re: [PATCH v2] RISC-V: KVM: Support runtime configuration for per-VM's HGATP mode
Posted by fangyu.yu@linux.alibaba.com 3 weeks, 1 day ago
>> From: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>>
>> Introduces two per-VM architecture-specific fields to support runtime
>> configuration of the G-stage page table format:
>>
>> - kvm->arch.kvm_riscv_gstage_mode: specifies the HGATP mode used by the
>>   current VM;
>> - kvm->arch.kvm_riscv_gstage_pgd_levels: the corresponding number of page
>>   table levels for the selected mode.
>>
>> These fields replace the previous global variables
>> kvm_riscv_gstage_mode and kvm_riscv_gstage_pgd_levels, enabling different
>> virtual machines to independently select their G-stage page table format
>> instead of being forced to share the maximum mode detected by the kernel
>> at boot time.
>>
>> Signed-off-by: Fangyu Yu <fangyu.yu@linux.alibaba.com>
>> ---
>>  arch/riscv/include/asm/kvm_gstage.h | 12 ++---
>>  arch/riscv/include/asm/kvm_host.h   |  4 ++
>>  arch/riscv/kvm/gstage.c             | 82 +++++++++++++++++------------
>>  arch/riscv/kvm/main.c               |  4 +-
>>  arch/riscv/kvm/mmu.c                | 18 +++++--
>>  arch/riscv/kvm/vm.c                 |  2 +-
>>  arch/riscv/kvm/vmid.c               |  2 +-
>>  7 files changed, 74 insertions(+), 50 deletions(-)
>>
>> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
>> index 595e2183173e..fdcada123b3f 100644
>> --- a/arch/riscv/include/asm/kvm_gstage.h
>> +++ b/arch/riscv/include/asm/kvm_gstage.h
>> @@ -29,16 +29,11 @@ struct kvm_gstage_mapping {
>>  #define kvm_riscv_gstage_index_bits	10
>>  #endif
>>
>> -extern unsigned long kvm_riscv_gstage_mode;
>> -extern unsigned long kvm_riscv_gstage_pgd_levels;
>> +extern unsigned long kvm_riscv_gstage_max_mode;
>> +extern unsigned long kvm_riscv_gstage_max_pgd_levels;
>>
>>  #define kvm_riscv_gstage_pgd_xbits	2
>>  #define kvm_riscv_gstage_pgd_size	(1UL << (HGATP_PAGE_SHIFT + kvm_riscv_gstage_pgd_xbits))
>> -#define kvm_riscv_gstage_gpa_bits	(HGATP_PAGE_SHIFT + \
>> -					 (kvm_riscv_gstage_pgd_levels * \
>> -					  kvm_riscv_gstage_index_bits) + \
>> -					 kvm_riscv_gstage_pgd_xbits)
>> -#define kvm_riscv_gstage_gpa_size	((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits))
>>
>>  bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>>  			       pte_t **ptepp, u32 *ptep_level);
>> @@ -69,4 +64,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>>
>>  void kvm_riscv_gstage_mode_detect(void);
>>
>> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k);
>> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k);
>> +
>>  #endif
>> diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h
>> index 24585304c02b..27ea8e8fd5b0 100644
>> --- a/arch/riscv/include/asm/kvm_host.h
>> +++ b/arch/riscv/include/asm/kvm_host.h
>> @@ -103,6 +103,10 @@ struct kvm_arch {
>>
>>  	/* KVM_CAP_RISCV_MP_STATE_RESET */
>>  	bool mp_state_reset;
>> +
>> +	unsigned long kvm_riscv_gstage_mode;
>
>There's a 1:1 mapping for mode/levels, so we don't need to track both.
>Since mode is rarely used, then I think something like this would still
>provide enough convenience without requiring the storage allocation.
>
> static inline unsigned long kvm_riscv_gstage_mode(struct kvm_gstage *gstage)
> {
>     unsigned long modes[] = {
>         [2] = HGATP_MODE_SV32X4,
>         [3] = HGATP_MODE_SV39X4,
>         [4] = HGATP_MODE_SV48X4,
>         [5] = HGATP_MODE_SV57X4,
>     };
>
>     return modes[gstage->kvm->arch.kvm_riscv_gstage_pgd_levels];
> }

Thanks for the suggestion.

You're right that gstage mode has a 1:1 mapping with pgd_levels, so keeping
both is redundant. In the next revision I'll drop kvm_riscv_gstage_mode and
derive HGATP.MODE from kvm_riscv_gstage_pgd_levels via a small helper.

>> +	unsigned long kvm_riscv_gstage_pgd_levels;
>> +	bool gstage_mode_initialized;
>>  };
>>
>>  struct kvm_cpu_trap {
>> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
>> index b67d60d722c2..06452e4c2ab2 100644
>> --- a/arch/riscv/kvm/gstage.c
>> +++ b/arch/riscv/kvm/gstage.c
>> @@ -12,22 +12,23 @@
>>  #include <asm/kvm_gstage.h>
>>
>>  #ifdef CONFIG_64BIT
>> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV39X4;
>> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 3;
>> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV39X4;
>
>With a kvm_riscv_gstage_mode() function we don't need
>kvm_riscv_gstage_max_mode either.

Thanks, agreed.

>> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 3;
>>  #else
>> -unsigned long kvm_riscv_gstage_mode __ro_after_init = HGATP_MODE_SV32X4;
>> -unsigned long kvm_riscv_gstage_pgd_levels __ro_after_init = 2;
>> +unsigned long kvm_riscv_gstage_max_mode __ro_after_init = HGATP_MODE_SV32X4;
>> +unsigned long kvm_riscv_gstage_max_pgd_levels __ro_after_init = 2;
>>  #endif
>>
>>  #define gstage_pte_leaf(__ptep)	\
>>  	(pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC))
>>
>> -static inline unsigned long gstage_pte_index(gpa_t addr, u32 level)
>> +static inline unsigned long gstage_pte_index(struct kvm_gstage *gstage,
>> +					     gpa_t addr, u32 level)
>>  {
>>  	unsigned long mask;
>>  	unsigned long shift = HGATP_PAGE_SHIFT + (kvm_riscv_gstage_index_bits * level);
>>
>> -	if (level == (kvm_riscv_gstage_pgd_levels - 1))
>> +	if (level == (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1))
>
>nit: we can drop the unnecessary () while touching this line.

Ack, will fix.

>>  		mask = (PTRS_PER_PTE * (1UL << kvm_riscv_gstage_pgd_xbits)) - 1;
>>  	else
>>  		mask = PTRS_PER_PTE - 1;
>> @@ -40,12 +41,13 @@ static inline unsigned long gstage_pte_page_vaddr(pte_t pte)
>>  	return (unsigned long)pfn_to_virt(__page_val_to_pfn(pte_val(pte)));
>>  }
>>
>> -static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
>> +static int gstage_page_size_to_level(struct kvm_gstage *gstage, unsigned long page_size,
>> +				     u32 *out_level)
>>  {
>>  	u32 i;
>>  	unsigned long psz = 1UL << 12;
>>
>> -	for (i = 0; i < kvm_riscv_gstage_pgd_levels; i++) {
>> +	for (i = 0; i < gstage->kvm->arch.kvm_riscv_gstage_pgd_levels; i++) {
>>  		if (page_size == (psz << (i * kvm_riscv_gstage_index_bits))) {
>>  			*out_level = i;
>>  			return 0;
>> @@ -55,21 +57,23 @@ static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level)
>>  	return -EINVAL;
>>  }
>>
>> -static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder)
>> +static int gstage_level_to_page_order(struct kvm_gstage *gstage, u32 level,
>> +				      unsigned long *out_pgorder)
>>  {
>> -	if (kvm_riscv_gstage_pgd_levels < level)
>> +	if (gstage->kvm->arch.kvm_riscv_gstage_pgd_levels < level)
>>  		return -EINVAL;
>>
>>  	*out_pgorder = 12 + (level * kvm_riscv_gstage_index_bits);
>>  	return 0;
>>  }
>>
>> -static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize)
>> +static int gstage_level_to_page_size(struct kvm_gstage *gstage, u32 level,
>> +				     unsigned long *out_pgsize)
>>  {
>>  	int rc;
>>  	unsigned long page_order = PAGE_SHIFT;
>>
>> -	rc = gstage_level_to_page_order(level, &page_order);
>> +	rc = gstage_level_to_page_order(gstage, level, &page_order);
>>  	if (rc)
>>  		return rc;
>>
>> @@ -81,11 +85,11 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>>  			       pte_t **ptepp, u32 *ptep_level)
>>  {
>>  	pte_t *ptep;
>> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
>> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>>
>>  	*ptep_level = current_level;
>>  	ptep = (pte_t *)gstage->pgd;
>> -	ptep = &ptep[gstage_pte_index(addr, current_level)];
>> +	ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>>  	while (ptep && pte_val(ptep_get(ptep))) {
>>  		if (gstage_pte_leaf(ptep)) {
>>  			*ptep_level = current_level;
>> @@ -97,7 +101,7 @@ bool kvm_riscv_gstage_get_leaf(struct kvm_gstage *gstage, gpa_t addr,
>>  			current_level--;
>>  			*ptep_level = current_level;
>>  			ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>> -			ptep = &ptep[gstage_pte_index(addr, current_level)];
>> +			ptep = &ptep[gstage_pte_index(gstage, addr, current_level)];
>>  		} else {
>>  			ptep = NULL;
>>  		}
>> @@ -110,7 +114,7 @@ static void gstage_tlb_flush(struct kvm_gstage *gstage, u32 level, gpa_t addr)
>>  {
>>  	unsigned long order = PAGE_SHIFT;
>>
>> -	if (gstage_level_to_page_order(level, &order))
>> +	if (gstage_level_to_page_order(gstage, level, &order))
>>  		return;
>>  	addr &= ~(BIT(order) - 1);
>>
>> @@ -125,9 +129,9 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>>  			     struct kvm_mmu_memory_cache *pcache,
>>  			     const struct kvm_gstage_mapping *map)
>>  {
>> -	u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
>> +	u32 current_level = gstage->kvm->arch.kvm_riscv_gstage_pgd_levels - 1;
>>  	pte_t *next_ptep = (pte_t *)gstage->pgd;
>> -	pte_t *ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
>> +	pte_t *ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>>
>>  	if (current_level < map->level)
>>  		return -EINVAL;
>> @@ -151,7 +155,7 @@ int kvm_riscv_gstage_set_pte(struct kvm_gstage *gstage,
>>  		}
>>
>>  		current_level--;
>> -		ptep = &next_ptep[gstage_pte_index(map->addr, current_level)];
>> +		ptep = &next_ptep[gstage_pte_index(gstage, map->addr, current_level)];
>>  	}
>>
>>  	if (pte_val(*ptep) != pte_val(map->pte)) {
>> @@ -175,7 +179,7 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>>  	out_map->addr = gpa;
>>  	out_map->level = 0;
>>
>> -	ret = gstage_page_size_to_level(page_size, &out_map->level);
>> +	ret = gstage_page_size_to_level(gstage, page_size, &out_map->level);
>>  	if (ret)
>>  		return ret;
>>
>> @@ -217,7 +221,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>>  	u32 next_ptep_level;
>>  	unsigned long next_page_size, page_size;
>>
>> -	ret = gstage_level_to_page_size(ptep_level, &page_size);
>> +	ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>>  	if (ret)
>>  		return;
>>
>> @@ -229,7 +233,7 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>>  	if (ptep_level && !gstage_pte_leaf(ptep)) {
>>  		next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>>  		next_ptep_level = ptep_level - 1;
>> -		ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
>> +		ret = gstage_level_to_page_size(gstage, next_ptep_level, &next_page_size);
>>  		if (ret)
>>  			return;
>>
>> @@ -263,7 +267,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>>
>>  	while (addr < end) {
>>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
>> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
>> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>>  		if (ret)
>>  			break;
>>
>> @@ -297,7 +301,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>>
>>  	while (addr < end) {
>>  		found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
>> -		ret = gstage_level_to_page_size(ptep_level, &page_size);
>> +		ret = gstage_level_to_page_size(gstage, ptep_level, &page_size);
>>  		if (ret)
>>  			break;
>>
>> @@ -319,41 +323,51 @@ void __init kvm_riscv_gstage_mode_detect(void)
>>  	/* Try Sv57x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV57X4;
>> -		kvm_riscv_gstage_pgd_levels = 5;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV57X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 5;
>>  		goto done;
>>  	}
>>
>>  	/* Try Sv48x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV48X4;
>> -		kvm_riscv_gstage_pgd_levels = 4;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV48X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 4;
>>  		goto done;
>>  	}
>>
>>  	/* Try Sv39x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV39X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV39X4;
>> -		kvm_riscv_gstage_pgd_levels = 3;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV39X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 3;
>>  		goto done;
>>  	}
>>  #else /* CONFIG_32BIT */
>>  	/* Try Sv32x4 G-stage mode */
>>  	csr_write(CSR_HGATP, HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT);
>>  	if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV32X4) {
>> -		kvm_riscv_gstage_mode = HGATP_MODE_SV32X4;
>> -		kvm_riscv_gstage_pgd_levels = 2;
>> +		kvm_riscv_gstage_max_mode = HGATP_MODE_SV32X4;
>> +		kvm_riscv_gstage_max_pgd_levels = 2;
>>  		goto done;
>>  	}
>>  #endif
>>
>>  	/* KVM depends on !HGATP_MODE_OFF */
>> -	kvm_riscv_gstage_mode = HGATP_MODE_OFF;
>> -	kvm_riscv_gstage_pgd_levels = 0;
>> +	kvm_riscv_gstage_max_mode = HGATP_MODE_OFF;
>> +	kvm_riscv_gstage_max_pgd_levels = 0;
>>
>>  done:
>>  	csr_write(CSR_HGATP, 0);
>>  	kvm_riscv_local_hfence_gvma_all();
>>  }
>> +
>> +unsigned long kvm_riscv_gstage_gpa_bits(struct kvm_arch *k) {
>
>Did you run checkpatch? I think it requires '{' to be on its own line.
>
>nit: s/k/ka/ would be consistent with other archs, although I see k is
>used in riscv's kvm_riscv_mmu_update_hgatp() but that can be fixed up
>in this patch since there's a change in the same place too.

Thanks for catching that.

Yes, checkpatch complains about the opening brace placement here. I'll fix
the style by moving '{' onto its own line. I'll also rename the argument
from 'k' to 'ka' for consistency (and update the existing usage in
kvm_riscv_mmu_update_hgatp() in the same patch since we're touching it
anyway).

I'll apply the same fixes to kvm_riscv_gstage_gpa_size() as well.

>
>> +	return (HGATP_PAGE_SHIFT + (k->kvm_riscv_gstage_pgd_levels *
>> +		    kvm_riscv_gstage_index_bits) +
>> +		    kvm_riscv_gstage_pgd_xbits);
>> +}
>> +
>> +gpa_t kvm_riscv_gstage_gpa_size(struct kvm_arch *k) {
>
>same comments as above
>
>> +	return ((gpa_t)(1ULL << kvm_riscv_gstage_gpa_bits(k)));
>
> return BIT_ULL(kvm_riscv_gstage_gpa_bits(ka));
>
>(the cast is implicit from return type)
>
>> +}
>> diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c
>> index 45536af521f0..56a246e0e791 100644
>> --- a/arch/riscv/kvm/main.c
>> +++ b/arch/riscv/kvm/main.c
>> @@ -105,7 +105,7 @@ static int __init riscv_kvm_init(void)
>>  		return rc;
>>
>>  	kvm_riscv_gstage_mode_detect();
>> -	switch (kvm_riscv_gstage_mode) {
>> +	switch (kvm_riscv_gstage_max_mode) {
>>  	case HGATP_MODE_SV32X4:
>>  		str = "Sv32x4";
>>  		break;
>> @@ -164,7 +164,7 @@ static int __init riscv_kvm_init(void)
>>  			 (rc) ? slist : "no features");
>>  	}
>>
>> -	kvm_info("using %s G-stage page table format\n", str);
>> +	kvm_info("Max G-stage page table format %s \n", str);
>>
>>  	kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits());
>>
>> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
>> index 4ab06697bfc0..574783907162 100644
>> --- a/arch/riscv/kvm/mmu.c
>> +++ b/arch/riscv/kvm/mmu.c
>> @@ -67,7 +67,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>>  		if (!writable)
>>  			map.pte = pte_wrprotect(map.pte);
>>
>> -		ret = kvm_mmu_topup_memory_cache(&pcache, kvm_riscv_gstage_pgd_levels);
>> +		ret = kvm_mmu_topup_memory_cache(&pcache,kvm->arch.kvm_riscv_gstage_pgd_levels);
>                                                         ^ missing space
>
>>  		if (ret)
>>  			goto out;
>>
>> @@ -186,8 +186,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
>>  	 * space addressable by the KVM guest GPA space.
>>  	 */
>>  	if ((new->base_gfn + new->npages) >=
>> -	    (kvm_riscv_gstage_gpa_size >> PAGE_SHIFT))
>> +			(kvm_riscv_gstage_gpa_size(&kvm->arch) >> PAGE_SHIFT)) {
>>  		return -EFAULT;
>> +	}
>
>nit: Remove the unnecessary () and the '{' and the condition will fit on
>one 100 char line.

Ack.

>>
>>  	hva = new->userspace_addr;
>>  	size = new->npages << PAGE_SHIFT;
>> @@ -332,7 +333,7 @@ int kvm_riscv_mmu_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>>  	memset(out_map, 0, sizeof(*out_map));
>>
>>  	/* We need minimum second+third level pages */
>> -	ret = kvm_mmu_topup_memory_cache(pcache, kvm_riscv_gstage_pgd_levels);
>> +	ret = kvm_mmu_topup_memory_cache(pcache, kvm->arch.kvm_riscv_gstage_pgd_levels);
>>  	if (ret) {
>>  		kvm_err("Failed to topup G-stage cache\n");
>>  		return ret;
>> @@ -431,6 +432,11 @@ int kvm_riscv_mmu_alloc_pgd(struct kvm *kvm)
>>  		return -ENOMEM;
>>  	kvm->arch.pgd = page_to_virt(pgd_page);
>>  	kvm->arch.pgd_phys = page_to_phys(pgd_page);
>> +	if (!kvm->arch.gstage_mode_initialized) {
>> +		/*user-space didn't set KVM_CAP_RISC_HGATP_MODE cap*/
>                  ^ missing space                                  ^ missing space
>> +		kvm->arch.kvm_riscv_gstage_mode = kvm_riscv_gstage_max_mode;
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = kvm_riscv_gstage_max_pgd_levels;
>
>Missing 'kvm->arch.gstage_mode_initialized = true' statement.

The initialization is done in the following commit of this series (patch 2/2).

>> +	}
>>
>>  	return 0;
>>  }
>> @@ -446,10 +452,12 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>>  		gstage.flags = 0;
>>  		gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>>  		gstage.pgd = kvm->arch.pgd;
>> -		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
>> +		kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size(&kvm->arch), false);
>>  		pgd = READ_ONCE(kvm->arch.pgd);
>>  		kvm->arch.pgd = NULL;
>>  		kvm->arch.pgd_phys = 0;
>> +		kvm->arch.kvm_riscv_gstage_mode = HGATP_MODE_OFF;
>> +		kvm->arch.kvm_riscv_gstage_pgd_levels = 0;
>>  	}
>>  	spin_unlock(&kvm->mmu_lock);
>>
>> @@ -459,8 +467,8 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>>
>>  void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
>>  {
>> -	unsigned long hgatp = kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>>  	struct kvm_arch *k = &vcpu->kvm->arch;
>> +	unsigned long hgatp = k->kvm_riscv_gstage_mode << HGATP_MODE_SHIFT;
>>
>>  	hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & HGATP_VMID;
>>  	hgatp |= (k->pgd_phys >> PAGE_SHIFT) & HGATP_PPN;
>> diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c
>> index 66d91ae6e9b2..4b2156df40fc 100644
>> --- a/arch/riscv/kvm/vm.c
>> +++ b/arch/riscv/kvm/vm.c
>> @@ -200,7 +200,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
>>  		r = KVM_USER_MEM_SLOTS;
>>  		break;
>>  	case KVM_CAP_VM_GPA_BITS:
>> -		r = kvm_riscv_gstage_gpa_bits;
>> +		r = kvm_riscv_gstage_gpa_bits(&kvm->arch);
>>  		break;
>>  	default:
>>  		r = 0;
>> diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
>> index cf34d448289d..db27430f111e 100644
>> --- a/arch/riscv/kvm/vmid.c
>> +++ b/arch/riscv/kvm/vmid.c
>> @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(vmid_lock);
>>  void __init kvm_riscv_gstage_vmid_detect(void)
>>  {
>>  	/* Figure-out number of VMID bits in HW */
>> -	csr_write(CSR_HGATP, (kvm_riscv_gstage_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
>> +	csr_write(CSR_HGATP, (kvm_riscv_gstage_max_mode << HGATP_MODE_SHIFT) | HGATP_VMID);
>>  	vmid_bits = csr_read(CSR_HGATP);
>>  	vmid_bits = (vmid_bits & HGATP_VMID) >> HGATP_VMID_SHIFT;
>>  	vmid_bits = fls_long(vmid_bits);
>> --
>> 2.50.1
>>
>
>Thanks,
>drew
>
Thanks,
Fangyu