Add and use a kvm_page_fault structure to track state when handling a
guest abort. Collecting everything in a single structure will enable a
variety of cleanups (reduce the number of params passed to helpers), and
will pave the way toward using "struct kvm_page_fault" in arch-neutral KVM
code, e.g. to consolidate logic for KVM_EXIT_MEMORY_FAULT.
No functional change intended.
Cc: James Houghton <jthoughton@google.com>
Link: https://lore.kernel.org/all/20250618042424.330664-1-jthoughton@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/arm64/include/asm/kvm_host.h |  18 ++++
 arch/arm64/kvm/mmu.c              | 143 ++++++++++++++----------------
 2 files changed, 87 insertions(+), 74 deletions(-)
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 2f2394cce24e..4623cbc1edf4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -413,6 +413,24 @@ struct kvm_vcpu_fault_info {
u64 disr_el1; /* Deferred [SError] Status Register */
};
+struct kvm_page_fault {
+ const u64 esr;
+ const bool exec;
+ const bool write;
+ const bool is_perm;
+
+ phys_addr_t fault_ipa; /* The address we faulted on */
+ phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
+
+ struct kvm_s2_trans *nested;
+
+ gfn_t gfn;
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+ kvm_pfn_t pfn;
+ struct page *page;
+};
+
/*
* VNCR() just places the VNCR_capable registers in the enum after
* __VNCR_START__, and the value (after correction) to be an 8-byte offset
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 49ce6bf623f7..ca98778989b2 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1477,38 +1477,29 @@ static bool kvm_vma_is_cacheable(struct vm_area_struct *vma)
}
}
-static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
- struct kvm_s2_trans *nested,
- struct kvm_memory_slot *memslot, unsigned long hva,
- bool fault_is_perm)
+static int user_mem_abort(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
int ret = 0;
- bool write_fault, writable, force_pte = false;
- bool exec_fault, mte_allowed, is_vma_cacheable;
+ bool writable, force_pte = false;
+ bool mte_allowed, is_vma_cacheable;
bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
unsigned long mmu_seq;
- phys_addr_t ipa = fault_ipa;
struct kvm *kvm = vcpu->kvm;
struct vm_area_struct *vma;
short vma_shift;
void *memcache;
- gfn_t gfn;
- kvm_pfn_t pfn;
- bool logging_active = memslot_is_logging(memslot);
+ bool logging_active = memslot_is_logging(fault->slot);
long vma_pagesize, fault_granule;
enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
struct kvm_pgtable *pgt;
- struct page *page;
vm_flags_t vm_flags;
enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED;
- if (fault_is_perm)
+ if (fault->is_perm)
fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
- write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
- VM_BUG_ON(write_fault && exec_fault);
+ VM_BUG_ON(fault->write && fault->exec);
- if (fault_is_perm && !write_fault && !exec_fault) {
+ if (fault->is_perm && !fault->write && !fault->exec) {
kvm_err("Unexpected L2 read permission error\n");
return -EFAULT;
}
@@ -1524,7 +1515,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* only exception to this is when dirty logging is enabled at runtime
* and a write fault needs to collapse a block entry into a table.
*/
- if (!fault_is_perm || (logging_active && write_fault)) {
+ if (!fault->is_perm || (logging_active && fault->write)) {
int min_pages = kvm_mmu_cache_min_pages(vcpu->arch.hw_mmu);
if (!is_protected_kvm_enabled())
@@ -1541,9 +1532,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* get block mapping for device MMIO region.
*/
mmap_read_lock(current->mm);
- vma = vma_lookup(current->mm, hva);
+ vma = vma_lookup(current->mm, fault->hva);
if (unlikely(!vma)) {
- kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
+ kvm_err("Failed to find VMA for hva 0x%lx\n", fault->hva);
mmap_read_unlock(current->mm);
return -EFAULT;
}
@@ -1556,13 +1547,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
force_pte = true;
vma_shift = PAGE_SHIFT;
} else {
- vma_shift = get_vma_page_shift(vma, hva);
+ vma_shift = get_vma_page_shift(vma, fault->hva);
}
switch (vma_shift) {
#ifndef __PAGETABLE_PMD_FOLDED
case PUD_SHIFT:
- if (fault_supports_stage2_huge_mapping(memslot, hva, PUD_SIZE))
+ if (fault_supports_stage2_huge_mapping(fault->slot, fault->hva, PUD_SIZE))
break;
fallthrough;
#endif
@@ -1570,7 +1561,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_shift = PMD_SHIFT;
fallthrough;
case PMD_SHIFT:
- if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE))
+ if (fault_supports_stage2_huge_mapping(fault->slot, fault->hva, PMD_SIZE))
break;
fallthrough;
case CONT_PTE_SHIFT:
@@ -1585,19 +1576,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
vma_pagesize = 1UL << vma_shift;
- if (nested) {
+ if (fault->nested) {
unsigned long max_map_size;
max_map_size = force_pte ? PAGE_SIZE : PUD_SIZE;
- ipa = kvm_s2_trans_output(nested);
+ WARN_ON_ONCE(fault->ipa != kvm_s2_trans_output(fault->nested));
/*
* If we're about to create a shadow stage 2 entry, then we
* can only create a block mapping if the guest stage 2 page
* table uses at least as big a mapping.
*/
- max_map_size = min(kvm_s2_trans_size(nested), max_map_size);
+ max_map_size = min(kvm_s2_trans_size(fault->nested), max_map_size);
/*
* Be careful that if the mapping size falls between
@@ -1618,11 +1609,11 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* place.
*/
if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE) {
- fault_ipa &= ~(vma_pagesize - 1);
- ipa &= ~(vma_pagesize - 1);
+ fault->fault_ipa &= ~(vma_pagesize - 1);
+ fault->ipa &= ~(vma_pagesize - 1);
}
- gfn = ipa >> PAGE_SHIFT;
+ fault->gfn = fault->ipa >> PAGE_SHIFT;
mte_allowed = kvm_vma_mte_allowed(vma);
vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
@@ -1645,20 +1636,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
mmu_seq = vcpu->kvm->mmu_invalidate_seq;
mmap_read_unlock(current->mm);
- pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
- &writable, &page);
- if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(hva, vma_shift);
+ fault->pfn = __kvm_faultin_pfn(fault->slot, fault->gfn,
+ fault->write ? FOLL_WRITE : 0,
+ &writable, &fault->page);
+ if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
+ kvm_send_hwpoison_signal(fault->hva, vma_shift);
return 0;
}
- if (is_error_noslot_pfn(pfn))
+ if (is_error_noslot_pfn(fault->pfn))
return -EFAULT;
/*
* Check if this is non-struct page memory PFN, and cannot support
* CMOs. It could potentially be unsafe to access as cachable.
*/
- if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
+ if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(fault->pfn)) {
if (is_vma_cacheable) {
/*
* Whilst the VMA owner expects cacheable mapping to this
@@ -1687,7 +1679,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
*/
s2_force_noncacheable = true;
}
- } else if (logging_active && !write_fault) {
+ } else if (logging_active && !fault->write) {
/*
* Only actually map the page as writable if this was a write
* fault.
@@ -1695,7 +1687,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
writable = false;
}
- if (exec_fault && s2_force_noncacheable)
+ if (fault->exec && s2_force_noncacheable)
return -ENOEXEC;
/*
@@ -1709,12 +1701,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* used to limit the invalidation scope if a TTL hint or a range
* isn't provided.
*/
- if (nested) {
- writable &= kvm_s2_trans_writable(nested);
- if (!kvm_s2_trans_readable(nested))
+ if (fault->nested) {
+ writable &= kvm_s2_trans_writable(fault->nested);
+ if (!kvm_s2_trans_readable(fault->nested))
prot &= ~KVM_PGTABLE_PROT_R;
- prot |= kvm_encode_nested_level(nested);
+ prot |= kvm_encode_nested_level(fault->nested);
}
kvm_fault_lock(kvm);
@@ -1729,12 +1721,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* backed by a THP and thus use block mapping if possible.
*/
if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
- if (fault_is_perm && fault_granule > PAGE_SIZE)
+ if (fault->is_perm && fault_granule > PAGE_SIZE)
vma_pagesize = fault_granule;
else
- vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
- hva, &pfn,
- &fault_ipa);
+ vma_pagesize = transparent_hugepage_adjust(kvm, fault->slot,
+ fault->hva, &fault->pfn,
+ &fault->fault_ipa);
if (vma_pagesize < 0) {
ret = vma_pagesize;
@@ -1742,10 +1734,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
}
}
- if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
+ if (!fault->is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
/* Check the VMM hasn't introduced a new disallowed VMA */
if (mte_allowed) {
- sanitise_mte_tags(kvm, pfn, vma_pagesize);
+ sanitise_mte_tags(kvm, fault->pfn, vma_pagesize);
} else {
ret = -EFAULT;
goto out_unlock;
@@ -1755,7 +1747,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (writable)
prot |= KVM_PGTABLE_PROT_W;
- if (exec_fault)
+ if (fault->exec)
prot |= KVM_PGTABLE_PROT_X;
if (s2_force_noncacheable) {
@@ -1764,7 +1756,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
else
prot |= KVM_PGTABLE_PROT_DEVICE;
} else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
- (!nested || kvm_s2_trans_executable(nested))) {
+ (!fault->nested || kvm_s2_trans_executable(fault->nested))) {
prot |= KVM_PGTABLE_PROT_X;
}
@@ -1773,26 +1765,26 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* permissions only if vma_pagesize equals fault_granule. Otherwise,
* kvm_pgtable_stage2_map() should be called to change block size.
*/
- if (fault_is_perm && vma_pagesize == fault_granule) {
+ if (fault->is_perm && vma_pagesize == fault_granule) {
/*
* Drop the SW bits in favour of those stored in the
* PTE, which will be preserved.
*/
prot &= ~KVM_NV_GUEST_MAP_SZ;
- ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault->fault_ipa, prot, flags);
} else {
- ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
- __pfn_to_phys(pfn), prot,
+ ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault->fault_ipa, vma_pagesize,
+ __pfn_to_phys(fault->pfn), prot,
memcache, flags);
}
out_unlock:
- kvm_release_faultin_page(kvm, page, !!ret, writable);
+ kvm_release_faultin_page(kvm, fault->page, !!ret, writable);
kvm_fault_unlock(kvm);
/* Mark the page dirty only if the fault is handled successfully */
if (writable && !ret)
- mark_page_dirty_in_slot(kvm, memslot, gfn);
+ mark_page_dirty_in_slot(kvm, fault->slot, fault->gfn);
return ret != -EAGAIN ? ret : 0;
}
@@ -1814,12 +1806,17 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
static int __kvm_handle_guest_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
unsigned long esr)
{
- struct kvm_s2_trans nested_trans, *nested = NULL;
- struct kvm_memory_slot *memslot;
- bool write_fault, writable;
- unsigned long hva;
- phys_addr_t ipa; /* Always the IPA in the L1 guest phys space */
- gfn_t gfn;
+ struct kvm_page_fault fault = {
+ .fault_ipa = fault_ipa,
+ .esr = esr,
+ .ipa = fault_ipa,
+
+ .write = kvm_is_write_fault(vcpu),
+ .exec = kvm_vcpu_trap_is_exec_fault(vcpu),
+ .is_perm = esr_fsc_is_permission_fault(esr),
+ };
+ struct kvm_s2_trans nested_trans;
+ bool writable;
int ret;
/*
@@ -1849,15 +1846,14 @@ static int __kvm_handle_guest_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa
return ret;
}
- ipa = kvm_s2_trans_output(&nested_trans);
- nested = &nested_trans;
+ fault.ipa = kvm_s2_trans_output(&nested_trans);
+ fault.nested = &nested_trans;
}
- gfn = ipa >> PAGE_SHIFT;
- memslot = gfn_to_memslot(vcpu->kvm, gfn);
- hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
- write_fault = kvm_is_write_fault(vcpu);
- if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
+ fault.gfn = fault.ipa >> PAGE_SHIFT;
+ fault.slot = gfn_to_memslot(vcpu->kvm, fault.gfn);
+ fault.hva = gfn_to_hva_memslot_prot(fault.slot, fault.gfn, &writable);
+ if (kvm_is_error_hva(fault.hva) || (fault.write && !writable)) {
/*
* The guest has put either its instructions or its page-tables
* somewhere it shouldn't have. Userspace won't be able to do
@@ -1882,7 +1878,7 @@ static int __kvm_handle_guest_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa
* So let's assume that the guest is just being
* cautious, and skip the instruction.
*/
- if (kvm_is_error_hva(hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
+ if (kvm_is_error_hva(fault.hva) && kvm_vcpu_dabt_is_cm(vcpu)) {
kvm_incr_pc(vcpu);
return 1;
}
@@ -1893,20 +1889,19 @@ static int __kvm_handle_guest_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa
* faulting VA. This is always 12 bits, irrespective
* of the page size.
*/
- ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
- return io_mem_abort(vcpu, ipa);
+ fault.ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
+ return io_mem_abort(vcpu, fault.ipa);
}
/* Userspace should not be able to register out-of-bounds IPAs */
- VM_BUG_ON(ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
+ VM_BUG_ON(fault.ipa >= kvm_phys_size(vcpu->arch.hw_mmu));
if (esr_fsc_is_access_flag_fault(esr)) {
- handle_access_fault(vcpu, fault_ipa);
+ handle_access_fault(vcpu, fault.fault_ipa);
return 1;
}
- ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
- esr_fsc_is_permission_fault(esr));
+ ret = user_mem_abort(vcpu, &fault);
if (ret == 0)
ret = 1;
out:
--
2.51.0.261.g7ce5a0a67e-goog
Hey Sean,

On Thu, Aug 21, 2025 at 02:00:31PM -0700, Sean Christopherson wrote:
> Add and use a kvm_page_fault structure to track state when handling a
> guest abort. Collecting everything in a single structure will enable a
> variety of cleanups (reduce the number of params passed to helpers), and
> will pave the way toward using "struct kvm_page_fault" in arch-neutral KVM
> code, e.g. to consolidate logic for KVM_EXIT_MEMORY_FAULT.
>
> No functional change intended.
>
> Cc: James Houghton <jthoughton@google.com>
> Link: https://lore.kernel.org/all/20250618042424.330664-1-jthoughton@google.com
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>  arch/arm64/include/asm/kvm_host.h |  18 ++++
>  arch/arm64/kvm/mmu.c              | 143 ++++++++++++++----------------
>  2 files changed, 87 insertions(+), 74 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index 2f2394cce24e..4623cbc1edf4 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -413,6 +413,24 @@ struct kvm_vcpu_fault_info {
> 	u64 disr_el1;		/* Deferred [SError] Status Register */
> };
>
> +struct kvm_page_fault {
> +	const u64 esr;
> +	const bool exec;
> +	const bool write;
> +	const bool is_perm;

Hmm... these might be better represented as predicates that take a
pointer to this struct and we just compute it based on ESR. That'd have
the benefit in the arch-neutral code where 'struct kvm_page_fault' is an
opaque type and we don't need to align field names/types.

> +	phys_addr_t fault_ipa;	/* The address we faulted on */
> +	phys_addr_t ipa;	/* Always the IPA in the L1 guest phys space */

NYC, but this also seems like a good opportunity to rename + retype
these guys. Specifically:

  fault_ipa => ipa
  ipa       => canonical_ipa

would clarify these and align with the verbiage we currently use to talk
about nested.

Thanks,
Oliver
On Thu, Aug 21, 2025, Oliver Upton wrote:
> Hey Sean,
>
> On Thu, Aug 21, 2025 at 02:00:31PM -0700, Sean Christopherson wrote:
> > Add and use a kvm_page_fault structure to track state when handling a
> > guest abort. Collecting everything in a single structure will enable a
> > variety of cleanups (reduce the number of params passed to helpers), and
> > will pave the way toward using "struct kvm_page_fault" in arch-neutral KVM
> > code, e.g. to consolidate logic for KVM_EXIT_MEMORY_FAULT.
> >
> > No functional change intended.
> >
> > Cc: James Houghton <jthoughton@google.com>
> > Link: https://lore.kernel.org/all/20250618042424.330664-1-jthoughton@google.com
> > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > ---
> >  arch/arm64/include/asm/kvm_host.h |  18 ++++
> >  arch/arm64/kvm/mmu.c              | 143 ++++++++++++++----------------
> >  2 files changed, 87 insertions(+), 74 deletions(-)
> >
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 2f2394cce24e..4623cbc1edf4 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -413,6 +413,24 @@ struct kvm_vcpu_fault_info {
> > 	u64 disr_el1;		/* Deferred [SError] Status Register */
> > };
> >
> > +struct kvm_page_fault {
> > +	const u64 esr;
> > +	const bool exec;
> > +	const bool write;
> > +	const bool is_perm;
>
> Hmm... these might be better represented as predicates that take a
> pointer to this struct and we just compute it based on ESR. That'd have
> the benefit in the arch-neutral code where 'struct kvm_page_fault' is an
> opaque type and we don't need to align field names/types.

We'd need to align function names/types though, so to some extent it's six of one,
half dozen of the other.  My slight preference would be to require kvm_page_fault
to have certain fields, but I'm ok with making kvm_page_fault opaque to generic
code and instead adding arch APIs.  Having a handful of wrappers in x86 isn't the
end of the world, and it would be more familiar for pretty much everyone.

> > +	phys_addr_t fault_ipa;	/* The address we faulted on */
> > +	phys_addr_t ipa;	/* Always the IPA in the L1 guest phys space */
>
> NYC, but this also seems like a good opportunity to rename + retype
> these guys. Specifically:
>
>   fault_ipa => ipa
>   ipa       => canonical_ipa
>
> would clarify these and align with the verbiage we currently use to talk
> about nested.

Heh, I'm so screwed.  x86's use of "canonical" is wildly different.

I can add a patch to do those renames (I think doing an "opportunistic" rename
would be a bit much).
On Tue, Aug 26, 2025 at 11:58:10AM -0700, Sean Christopherson wrote:
> On Thu, Aug 21, 2025, Oliver Upton wrote:
> > Hey Sean,
> >
> > On Thu, Aug 21, 2025 at 02:00:31PM -0700, Sean Christopherson wrote:
> > > Add and use a kvm_page_fault structure to track state when handling a
> > > guest abort. Collecting everything in a single structure will enable a
> > > variety of cleanups (reduce the number of params passed to helpers), and
> > > will pave the way toward using "struct kvm_page_fault" in arch-neutral KVM
> > > code, e.g. to consolidate logic for KVM_EXIT_MEMORY_FAULT.
> > >
> > > No functional change intended.
> > >
> > > Cc: James Houghton <jthoughton@google.com>
> > > Link: https://lore.kernel.org/all/20250618042424.330664-1-jthoughton@google.com
> > > Signed-off-by: Sean Christopherson <seanjc@google.com>
> > > ---
> > >  arch/arm64/include/asm/kvm_host.h |  18 ++++
> > >  arch/arm64/kvm/mmu.c              | 143 ++++++++++++++----------------
> > >  2 files changed, 87 insertions(+), 74 deletions(-)
> > >
> > > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > > index 2f2394cce24e..4623cbc1edf4 100644
> > > --- a/arch/arm64/include/asm/kvm_host.h
> > > +++ b/arch/arm64/include/asm/kvm_host.h
> > > @@ -413,6 +413,24 @@ struct kvm_vcpu_fault_info {
> > > 	u64 disr_el1;		/* Deferred [SError] Status Register */
> > > };
> > >
> > > +struct kvm_page_fault {
> > > +	const u64 esr;
> > > +	const bool exec;
> > > +	const bool write;
> > > +	const bool is_perm;
> >
> > Hmm... these might be better represented as predicates that take a
> > pointer to this struct and we just compute it based on ESR. That'd have
> > the benefit in the arch-neutral code where 'struct kvm_page_fault' is an
> > opaque type and we don't need to align field names/types.
>
> We'd need to align function names/types though, so to some extent it's six of one,
> half dozen of the other.  My slight preference would be to require kvm_page_fault
> to have certain fields, but I'm ok with making kvm_page_fault opaque to generic
> code and instead adding arch APIs.  Having a handful of wrappers in x86 isn't the
> end of the world, and it would be more familiar for pretty much everyone.

To clarify my earlier point, my actual interest is in using ESR as the
source of truth from the arch POV, interface to the arch-neutral code
isn't that big of a deal either way.

Thanks,
Oliver
On Tue, Aug 26, 2025, Oliver Upton wrote:
> On Tue, Aug 26, 2025 at 11:58:10AM -0700, Sean Christopherson wrote:
> > On Thu, Aug 21, 2025, Oliver Upton wrote:
> > > > +struct kvm_page_fault {
> > > > +	const u64 esr;
> > > > +	const bool exec;
> > > > +	const bool write;
> > > > +	const bool is_perm;
> > >
> > > Hmm... these might be better represented as predicates that take a
> > > pointer to this struct and we just compute it based on ESR. That'd have
> > > the benefit in the arch-neutral code where 'struct kvm_page_fault' is an
> > > opaque type and we don't need to align field names/types.
> >
> > We'd need to align function names/types though, so to some extent it's six of one,
> > half dozen of the other.  My slight preference would be to require kvm_page_fault
> > to have certain fields, but I'm ok with making kvm_page_fault opaque to generic
> > code and instead adding arch APIs.  Having a handful of wrappers in x86 isn't the
> > end of the world, and it would be more familiar for pretty much everyone.
>
> To clarify my earlier point, my actual interest is in using ESR as the
> source of truth from the arch POV, interface to the arch-neutral code
> isn't that big of a deal either way.

Ya, but that would mean having something like

	static bool kvm_is_exec_fault(struct kvm_page_fault *fault)
	{
		return esr_trap_is_iabt(fault->esr) && !esr_abt_iss1tw(fault->esr);
	}

and

	if (kvm_is_exec_fault(fault))

in arm64 code and then

	if (fault->exec)

in arch-neutral code, which, eww.

I like the idea of having a single source of truth, but that's going to be a
massive amount of work to do it "right", e.g. O(weeks) if not O(months).  E.g.
to replace fault->exec with kvm_is_exec_fault(), AFAICT it would require
duplicating all of kvm_is_write_fault().  Rinse and repeat for 20+ APIs in
kvm_emulate.h that take a vCPU and pull ESR from vcpu->arch.fault.esr_el2.

As an intermediate state, having that many duplicate APIs is tolerable, but I
wouldn't want to leave that as the "end" state for any kernel release, and
ideally not for any given series.  That means adding a pile of esr-based APIs,
converting _all_ users, then dropping the vcpu-based APIs.  That's a lot of
code and patches.

E.g. even if we convert all of kvm_handle_guest_abort(), which itself is a big
task, there will still be usage of many of the APIs in at least
kvm_translate_vncr(), io_mem_abort(), and kvm_handle_mmio_return().  Converting
all of those is totally doable, e.g. through a combination of using
kvm_page_fault and local snapshots of esr, but it will be a lot of work and
churn.

The work+churn itself doesn't bother me, but I would prefer not to block
arch-neutral usage of kvm_page_fault for months on end, nor do I want to leave
KVM arm64 in a half-baked state, i.e. I wouldn't feel comfortable converting
just __kvm_handle_guest_abort() and walking away.

What if we keep the exec, write, and is_perm fields for now, but add proper
APIs to access kvm_page_fault from common code?  The APIs would be largely
duplicate code between x86 and arm64 (though I think kvm_get_fault_gpa() would
be different, so yay), but that's not a big deal.  That way common KVM can
start building out functionality based on kvm_page_fault, and arm64 can
independently convert to making fault->esr the single source of truth, without
having to worry about perturbing common code.
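
As a rough sketch of that last idea (the accessor names below are purely
illustrative, not an existing KVM API), the arm64 side of such wrappers could
start out as trivial forwarders to the fields this patch adds, e.g.:

	/* Sketch only: hypothetical arch accessors for arch-neutral code. */
	static inline bool kvm_fault_is_write(struct kvm_page_fault *fault)
	{
		return fault->write;
	}

	static inline bool kvm_fault_is_exec(struct kvm_page_fault *fault)
	{
		return fault->exec;
	}

	static inline gpa_t kvm_fault_gpa(struct kvm_page_fault *fault)
	{
		/* On arm64 this would presumably be the L1 (canonical) IPA. */
		return fault->ipa;
	}

with common code only ever going through the accessors, so arm64 stays free to
later derive these from fault->esr without touching the arch-neutral callers.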