[v3] KVM: Introduce KVM Userfault

[PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits

Posted by James Houghton 3 months, 3 weeks ago

From: Sean Christopherson <seanjc@google.com>

Now that both arm64 and x86 define "struct kvm_page_fault" with a base set
of fields, rework kvm_prepare_memory_fault_exit() to take a kvm_page_fault
structure instead of passing in a pile of parameters.  Guard the related
code with CONFIG_KVM_GENERIC_PAGE_FAULT to play nice with architectures
that don't yet support kvm_page_fault.

Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
simply assert that the handful of required fields are provided by the
arch-defined structure.  Unlike vCPU and VMs, the number of common fields
is expected to be small, and letting arch code fully define the structure
allows for maximum flexibility with respect to const, layout, etc.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
---
 arch/arm64/kvm/Kconfig          |  1 +
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/mmu/mmu.c          |  8 ++++----
 arch/x86/kvm/mmu/mmu_internal.h | 10 +---------
 include/linux/kvm_host.h        | 26 ++++++++++++++++++++------
 virt/kvm/Kconfig                |  3 +++
 6 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 713248f240e03..3c299735b1668 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -37,6 +37,7 @@ menuconfig KVM
 	select HAVE_KVM_VCPU_RUN_PID_CHANGE
 	select SCHED_INFO
 	select GUEST_PERF_EVENTS if PERF_EVENTS
+	select KVM_GENERIC_PAGE_FAULT
 	help
 	  Support hosting virtualized guest machines.
 
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2eeffcec53828..2d5966f15738d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -48,6 +48,7 @@ config KVM_X86
 	select KVM_GENERIC_PRE_FAULT_MEMORY
 	select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
 	select KVM_WERROR if WERROR
+	select KVM_GENERIC_PAGE_FAULT
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index cbc84c6abc2e3..a4439e9e07268 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3429,7 +3429,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
 	gva_t gva = fault->is_tdp ? 0 : fault->addr;
 
 	if (fault->is_private) {
-		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		kvm_prepare_memory_fault_exit(vcpu, fault);
 		return -EFAULT;
 	}
 
@@ -4499,14 +4499,14 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
 	int max_order, r;
 
 	if (!kvm_slot_can_be_private(fault->slot)) {
-		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		kvm_prepare_memory_fault_exit(vcpu, fault);
 		return -EFAULT;
 	}
 
 	r = kvm_gmem_get_pfn(vcpu->kvm, fault->slot, fault->gfn, &fault->pfn,
 			     &fault->refcounted_page, &max_order);
 	if (r) {
-		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		kvm_prepare_memory_fault_exit(vcpu, fault);
 		return r;
 	}
 
@@ -4586,7 +4586,7 @@ static int kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
 	 * private vs. shared mismatch.
 	 */
 	if (fault->is_private != kvm_mem_is_private(kvm, fault->gfn)) {
-		kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
+		kvm_prepare_memory_fault_exit(vcpu, fault);
 		return -EFAULT;
 	}
 
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 384fc4d0bfec0..c15060ed6e8be 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -269,14 +269,6 @@ enum {
  */
 static_assert(RET_PF_CONTINUE == 0);
 
-static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
-						     struct kvm_page_fault *fault)
-{
-	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
-				      PAGE_SIZE, fault->write, fault->exec,
-				      fault->is_private);
-}
-
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 					u64 err, bool prefetch,
 					int *emulation_type, u8 *level)
@@ -329,7 +321,7 @@ static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 	 */
 	if (r == RET_PF_EMULATE && fault.is_private) {
 		pr_warn_ratelimited("kvm: unexpected emulation request on private memory\n");
-		kvm_mmu_prepare_memory_fault_exit(vcpu, &fault);
+		kvm_prepare_memory_fault_exit(vcpu, &fault);
 		return -EFAULT;
 	}
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3bde4fb5c6aa4..9a85500cd5c50 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2497,20 +2497,34 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
 /* Max number of entries allowed for each kvm dirty ring */
 #define  KVM_DIRTY_RING_MAX_ENTRIES  65536
 
+#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
+
+#define KVM_ASSERT_TYPE_IS(type_t, x)					\
+do {									\
+	type_t __maybe_unused tmp;					\
+									\
+	BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x));	\
+} while (0)
+
 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
-						 gpa_t gpa, gpa_t size,
-						 bool is_write, bool is_exec,
-						 bool is_private)
+						 struct kvm_page_fault *fault)
 {
+	KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
+	KVM_ASSERT_TYPE_IS(bool, fault->exec);
+	KVM_ASSERT_TYPE_IS(bool, fault->write);
+	KVM_ASSERT_TYPE_IS(bool, fault->is_private);
+	KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
+
 	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
-	vcpu->run->memory_fault.gpa = gpa;
-	vcpu->run->memory_fault.size = size;
+	vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
+	vcpu->run->memory_fault.size = PAGE_SIZE;
 
 	/* RWX flags are not (yet) defined or communicated to userspace. */
 	vcpu->run->memory_fault.flags = 0;
-	if (is_private)
+	if (fault->is_private)
 		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
 }
+#endif
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
 static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn)
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 727b542074e7e..28ed6b241578b 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -128,3 +128,6 @@ config HAVE_KVM_ARCH_GMEM_PREPARE
 config HAVE_KVM_ARCH_GMEM_INVALIDATE
        bool
        depends on KVM_PRIVATE_MEM
+
+config KVM_GENERIC_PAGE_FAULT
+       bool
-- 
2.50.0.rc2.692.g299adb8693-goog

Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits

Posted by Oliver Upton 3 months, 3 weeks ago

On Wed, Jun 18, 2025 at 04:24:12AM +0000, James Houghton wrote:
> +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> +
> +#define KVM_ASSERT_TYPE_IS(type_t, x)					\
> +do {									\
> +	type_t __maybe_unused tmp;					\
> +									\
> +	BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x));	\
> +} while (0)
> +
>  static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> -						 gpa_t gpa, gpa_t size,
> -						 bool is_write, bool is_exec,
> -						 bool is_private)
> +						 struct kvm_page_fault *fault)
>  {
> +	KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
> +	KVM_ASSERT_TYPE_IS(bool, fault->exec);
> +	KVM_ASSERT_TYPE_IS(bool, fault->write);
> +	KVM_ASSERT_TYPE_IS(bool, fault->is_private);
> +	KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
> +
>  	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> -	vcpu->run->memory_fault.gpa = gpa;
> -	vcpu->run->memory_fault.size = size;
> +	vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
> +	vcpu->run->memory_fault.size = PAGE_SIZE;
>  
>  	/* RWX flags are not (yet) defined or communicated to userspace. */
>  	vcpu->run->memory_fault.flags = 0;
> -	if (is_private)
> +	if (fault->is_private)
>  		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
>  }
> +#endif

This *is not* the right direction of travel for arm64. Stage-2 aborts /
EPT violations / etc. are extremely architecture-specific events.

What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
we provide as much syndrome information as possible. That could imply
some combination of a sanitised view of ESR_EL2 and, where it is
unambiguous, common fault flags that have shared definitions with x86.
This could incur some minor code duplication, but even then we can share
helpers for the software bits (like userfault).

FEAT_MTE_PERM is a very good example for this. There exists a "Tag"
permission at stage-2 which is unrelated to any of the 'normal'
read/write permissions. There's also the MostlyReadOnly permission from
FEAT_THE which grants write permission to a specific set of instructions.

I don't want to paper over these nuances and will happily maintain an
arm64-specific flavor of "kvm_prepare_memory_fault_exit()"/

Thanks,
Oliver

Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits

Posted by Sean Christopherson 3 months, 3 weeks ago

On Wed, Jun 18, 2025, Oliver Upton wrote:
> On Wed, Jun 18, 2025 at 04:24:12AM +0000, James Houghton wrote:
> > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > +
> > +#define KVM_ASSERT_TYPE_IS(type_t, x)					\
> > +do {									\
> > +	type_t __maybe_unused tmp;					\
> > +									\
> > +	BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x));	\
> > +} while (0)
> > +
> >  static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> > -						 gpa_t gpa, gpa_t size,
> > -						 bool is_write, bool is_exec,
> > -						 bool is_private)
> > +						 struct kvm_page_fault *fault)
> >  {
> > +	KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
> > +	KVM_ASSERT_TYPE_IS(bool, fault->exec);
> > +	KVM_ASSERT_TYPE_IS(bool, fault->write);
> > +	KVM_ASSERT_TYPE_IS(bool, fault->is_private);
> > +	KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
> > +
> >  	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > -	vcpu->run->memory_fault.gpa = gpa;
> > -	vcpu->run->memory_fault.size = size;
> > +	vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
> > +	vcpu->run->memory_fault.size = PAGE_SIZE;
> >  
> >  	/* RWX flags are not (yet) defined or communicated to userspace. */
> >  	vcpu->run->memory_fault.flags = 0;
> > -	if (is_private)
> > +	if (fault->is_private)
> >  		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
> >  }
> > +#endif
> 
> This *is not* the right direction of travel for arm64. Stage-2 aborts /
> EPT violations / etc. are extremely architecture-specific events.

Yes and no.  100% agreed there are arch/vendor specific aspects of stage-2 faults,
but there are most definitely commonalites as well.

> What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
> we provide as much syndrome information as possible. That could imply
> some combination of a sanitised view of ESR_EL2 and, where it is
> unambiguous, common fault flags that have shared definitions with x86.

Me confused, this is what the above does?  "struct kvm_page_fault" is arch
specific, e.g. x86 has a whole pile of stuff in there beyond gfn, exec, write,
is_private, and slot.

The approach is non-standard, but I think my justification/reasoning for having
the structure be arch-defined still holds:

 : Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
 : simply assert that the handful of required fields are provided by the
 : arch-defined structure.  Unlike vCPU and VMs, the number of common fields
 : is expected to be small, and letting arch code fully define the structure
 : allows for maximum flexibility with respect to const, layout, etc.

If we could use anonymous struct field, i.e. could embed a kvm_arch_page_fault
without having to bounce through an "arch" field, I would vote for the approach.
Sadly, AFAIK, we can't yet use those in the kernel.

> This could incur some minor code duplication, but even then we can share
> helpers for the software bits (like userfault).

Again, that is what is proposed here.

> FEAT_MTE_PERM is a very good example for this. There exists a "Tag"
> permission at stage-2 which is unrelated to any of the 'normal'
> read/write permissions. There's also the MostlyReadOnly permission from
> FEAT_THE which grants write permission to a specific set of instructions.
> 
> I don't want to paper over these nuances and will happily maintain an
> arm64-specific flavor of "kvm_prepare_memory_fault_exit()"

Nothing prevents arm64 (or any arch) from wrapping kvm_prepare_memory_fault_exit()
and/or taking action after it's invoked.  That's not an accident; the "prepare
exit" helpers (x86 has a few more) were specifically designed to not be used as
the "return" to userspace.  E.g. this one returns "void" instead of -EFAULT
specifically so that the callers isn't "required" to ignore the return if the
caller wants to populate (or change, but hopefully that's never the case) fields
after calling kvm_prepare_memory_fault_exit), and so that arch can return an
entirely different error code, e.g. -EHWPOISON when appropriate.

And it's not just kvm_prepare_memory_fault_exit() that I want to use kvm_page_fault;
kvm_faultin_pfn() is another case where having a common "struct kvm_page_fault"
would clean up some ugly/annoying boilerplate.

Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits

Posted by Oliver Upton 3 months, 3 weeks ago

On Wed, Jun 18, 2025 at 01:47:36PM -0700, Sean Christopherson wrote:
> On Wed, Jun 18, 2025, Oliver Upton wrote:
> > On Wed, Jun 18, 2025 at 04:24:12AM +0000, James Houghton wrote:
> > > +#ifdef CONFIG_KVM_GENERIC_PAGE_FAULT
> > > +
> > > +#define KVM_ASSERT_TYPE_IS(type_t, x)					\
> > > +do {									\
> > > +	type_t __maybe_unused tmp;					\
> > > +									\
> > > +	BUILD_BUG_ON(!__types_ok(tmp, x) || !__typecheck(tmp, x));	\
> > > +} while (0)
> > > +
> > >  static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
> > > -						 gpa_t gpa, gpa_t size,
> > > -						 bool is_write, bool is_exec,
> > > -						 bool is_private)
> > > +						 struct kvm_page_fault *fault)
> > >  {
> > > +	KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);
> > > +	KVM_ASSERT_TYPE_IS(bool, fault->exec);
> > > +	KVM_ASSERT_TYPE_IS(bool, fault->write);
> > > +	KVM_ASSERT_TYPE_IS(bool, fault->is_private);
> > > +	KVM_ASSERT_TYPE_IS(struct kvm_memory_slot *, fault->slot);
> > > +
> > >  	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
> > > -	vcpu->run->memory_fault.gpa = gpa;
> > > -	vcpu->run->memory_fault.size = size;
> > > +	vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
> > > +	vcpu->run->memory_fault.size = PAGE_SIZE;
> > >  
> > >  	/* RWX flags are not (yet) defined or communicated to userspace. */
> > >  	vcpu->run->memory_fault.flags = 0;
> > > -	if (is_private)
> > > +	if (fault->is_private)
> > >  		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
> > >  }
> > > +#endif
> > 
> > This *is not* the right direction of travel for arm64. Stage-2 aborts /
> > EPT violations / etc. are extremely architecture-specific events.
> 
> Yes and no.  100% agreed there are arch/vendor specific aspects of stage-2 faults,
> but there are most definitely commonalites as well.

And I agree those commonalities should be expressed with the same flags
where possible.

> > What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
> > we provide as much syndrome information as possible. That could imply
> > some combination of a sanitised view of ESR_EL2 and, where it is
> > unambiguous, common fault flags that have shared definitions with x86.
> 
> Me confused, this is what the above does?  "struct kvm_page_fault" is arch
> specific, e.g. x86 has a whole pile of stuff in there beyond gfn, exec, write,
> is_private, and slot.

Right, but now I need to remember that some of the hardware syndrome
(exec, write) is handled in the arch-neutral code and the rest belongs
to the arch.

> The approach is non-standard, but I think my justification/reasoning for having
> the structure be arch-defined still holds:
> 
>  : Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
>  : simply assert that the handful of required fields are provided by the
>  : arch-defined structure.  Unlike vCPU and VMs, the number of common fields
>  : is expected to be small, and letting arch code fully define the structure
>  : allows for maximum flexibility with respect to const, layout, etc.
> 
> If we could use anonymous struct field, i.e. could embed a kvm_arch_page_fault
> without having to bounce through an "arch" field, I would vote for the approach.
> Sadly, AFAIK, we can't yet use those in the kernel.

The general impression is that this is an unnecessary amount of
complexity for doing something trivial (computing flags).

> > This could incur some minor code duplication, but even then we can share
> > helpers for the software bits (like userfault).
> 
> Again, that is what is proposed here.
> 
> > FEAT_MTE_PERM is a very good example for this. There exists a "Tag"
> > permission at stage-2 which is unrelated to any of the 'normal'
> > read/write permissions. There's also the MostlyReadOnly permission from
> > FEAT_THE which grants write permission to a specific set of instructions.
> > 
> > I don't want to paper over these nuances and will happily maintain an
> > arm64-specific flavor of "kvm_prepare_memory_fault_exit()"
> 
> Nothing prevents arm64 (or any arch) from wrapping kvm_prepare_memory_fault_exit()
> and/or taking action after it's invoked.  That's not an accident; the "prepare
> exit" helpers (x86 has a few more) were specifically designed to not be used as
> the "return" to userspace.  E.g. this one returns "void" instead of -EFAULT
> specifically so that the callers isn't "required" to ignore the return if the
> caller wants to populate (or change, but hopefully that's never the case) fields
> after calling kvm_prepare_memory_fault_exit), and so that arch can return an
> entirely different error code, e.g. -EHWPOISON when appropriate.

IMO, this does not achieve the desired layering / ownership of memory
fault triage. This would be better organized as the arch code computing
all of the flags relating to the hardware syndrome (even boring ones
like RWX) and arch-neutral code potentially lending a hand with the
software bits.

With this I either need to genericize the horrors of the Arm
architecture in the common thing or keep track of what parts of the
hardware flags are owned by arch v. non-arch. SW v. HW fault context is
a cleaner split, IMO.

> And it's not just kvm_prepare_memory_fault_exit() that I want to use kvm_page_fault;
> kvm_faultin_pfn() is another case where having a common "struct kvm_page_fault"
> would clean up some ugly/annoying boilerplate.

That might be a better starting point for unifying these things, esp.
since kvm_faultin_pfn() doesn't have UAPI implications hiding behind it
and is already using common parameters.

Thanks,
Oliver

Re: [PATCH v3 03/15] KVM: arm64: x86: Require "struct kvm_page_fault" for memory fault exits

Posted by Sean Christopherson 3 months, 3 weeks ago

On Wed, Jun 18, 2025, Oliver Upton wrote:
> On Wed, Jun 18, 2025 at 01:47:36PM -0700, Sean Christopherson wrote:
> > On Wed, Jun 18, 2025, Oliver Upton wrote:
> > > What I would like to see on arm64 is that for every "KVM_EXIT_MEMORY_FAULT"
> > > we provide as much syndrome information as possible. That could imply
> > > some combination of a sanitised view of ESR_EL2 and, where it is
> > > unambiguous, common fault flags that have shared definitions with x86.
> > 
> > Me confused, this is what the above does?  "struct kvm_page_fault" is arch
> > specific, e.g. x86 has a whole pile of stuff in there beyond gfn, exec, write,
> > is_private, and slot.
> 
> Right, but now I need to remember that some of the hardware syndrome
> (exec, write) is handled in the arch-neutral code and the rest belongs
> to the arch.

Yeah, can't argue there.

> > The approach is non-standard, but I think my justification/reasoning for having
> > the structure be arch-defined still holds:
> > 
> >  : Rather than define a common kvm_page_fault and kvm_arch_page_fault child,
> >  : simply assert that the handful of required fields are provided by the
> >  : arch-defined structure.  Unlike vCPU and VMs, the number of common fields
> >  : is expected to be small, and letting arch code fully define the structure
> >  : allows for maximum flexibility with respect to const, layout, etc.
> > 
> > If we could use anonymous struct field, i.e. could embed a kvm_arch_page_fault
> > without having to bounce through an "arch" field, I would vote for the approach.
> > Sadly, AFAIK, we can't yet use those in the kernel.
> 
> The general impression is that this is an unnecessary amount of complexity
> for doing something trivial (computing flags).

It looks pretty though!

> > Nothing prevents arm64 (or any arch) from wrapping kvm_prepare_memory_fault_exit()
> > and/or taking action after it's invoked.  That's not an accident; the "prepare
> > exit" helpers (x86 has a few more) were specifically designed to not be used as
> > the "return" to userspace.  E.g. this one returns "void" instead of -EFAULT
> > specifically so that the callers isn't "required" to ignore the return if the
> > caller wants to populate (or change, but hopefully that's never the case) fields
> > after calling kvm_prepare_memory_fault_exit), and so that arch can return an
> > entirely different error code, e.g. -EHWPOISON when appropriate.
> 
> IMO, this does not achieve the desired layering / ownership of memory
> fault triage. This would be better organized as the arch code computing
> all of the flags relating to the hardware syndrome (even boring ones
> like RWX) 

Just to make sure I'm not misinterpreting things, by "computing all of the flags",
you mean computing KVM_MEMORY_EXIT_FLAG_xxx flags that are derived from hardware
state, correct?

> and arch-neutral code potentially lending a hand with the software bits.
>
> With this I either need to genericize the horrors of the Arm
> architecture in the common thing or keep track of what parts of the
> hardware flags are owned by arch v. non-arch. SW v. HW fault context is
> a cleaner split, IMO.

The problem I'm struggling with is where to draw the line.  If we leave hardware
state to arch code, then we're not left with much.  Hmm, but it really is just
the gfn/gpa that's needed in common code to avoid true ugliness.  The size is
technically arch specific, but the reported size is effectively a placeholder,
i.e. it's always PAGE_SIZE, and probably always will be PAGE_SIZE, but we wanted
to give ourselves an out if necessary.

Would you be ok having common code fill gpa and size?  If so, then we can do this:

--
void kvm_arch_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
					struct kvm_page_fault *fault);

static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
						 struct kvm_page_fault *fault)
{
	KVM_ASSERT_TYPE_IS(gfn_t, fault->gfn);

	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
	vcpu->run->memory_fault.gpa = fault->gfn << PAGE_SHIFT;
	vcpu->run->memory_fault.size = PAGE_SIZE;

	vcpu->run->memory_fault.flags = 0;
	kvm_arch_prepare_memory_fault_exit(vcpu, fault);
}
--

where arm64's arch hook is empty, and x86's is:

--
static inline void kvm_arch_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
						      struct kvm_page_fault *fault)
{
	if (fault->is_private)
		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
}
--

It's not perfect, but it should be much easier to describe the contract, and
common code can still pass around a kvm_page_fault structure instead of a horde
of booleans.