[PATCH v2 05/13] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT

Posted by James Houghton 1 year ago
Adhering to the requirements of KVM Userfault:

1. Zap all sptes for the memslot when KVM_MEM_USERFAULT is toggled on
   with kvm_arch_flush_shadow_memslot().
2. Only allow PAGE_SIZE sptes when KVM_MEM_USERFAULT is enabled (for both
   normal/GUP memory and guest_memfd memory).
3. Reconstruct huge mappings when KVM_MEM_USERFAULT is toggled off with
   kvm_mmu_recover_huge_pages(). This is the behavior when dirty logging
   is disabled; remain consistent with it.

With the new logic in kvm_mmu_slot_apply_flags(), I've simplified the
two dirty-logging-toggle checks into one, and I have dropped the
WARN_ON() that was there.

Signed-off-by: James Houghton <jthoughton@google.com>
---
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/mmu/mmu.c          | 27 +++++++++++++++++++++----
 arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++++++++---
 arch/x86/kvm/x86.c              | 36 ++++++++++++++++++++++++---------
 include/linux/kvm_host.h        |  5 ++++-
 5 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ea2c4f21c1ca..286c6825cd1c 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -47,6 +47,7 @@ config KVM_X86
 	select KVM_GENERIC_PRE_FAULT_MEMORY
 	select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
 	select KVM_WERROR if WERROR
+	select HAVE_KVM_USERFAULT
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 2401606db260..5cab2785b97f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4280,14 +4280,19 @@ static inline u8 kvm_max_level_for_order(int order)
 	return PG_LEVEL_4K;
 }
 
-static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
-					u8 max_level, int gmem_order)
+static u8 kvm_max_private_mapping_level(struct kvm *kvm,
+					struct kvm_memory_slot *slot,
+					kvm_pfn_t pfn, u8 max_level,
+					int gmem_order)
 {
 	u8 req_max_level;
 
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
 
+	if (kvm_memslot_userfault(slot))
+		return PG_LEVEL_4K;
+
 	max_level = min(kvm_max_level_for_order(gmem_order), max_level);
 	if (max_level == PG_LEVEL_4K)
 		return PG_LEVEL_4K;
@@ -4324,8 +4329,10 @@ static int kvm_mmu_faultin_pfn_private(struct kvm_vcpu *vcpu,
 	}
 
 	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
-	fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->pfn,
-							 fault->max_level, max_order);
+	fault->max_level = kvm_max_private_mapping_level(vcpu->kvm, fault->slot,
+							 fault->pfn,
+							 fault->max_level,
+							 max_order);
 
 	return RET_PF_CONTINUE;
 }
@@ -4334,6 +4341,18 @@ static int __kvm_mmu_faultin_pfn(struct kvm_vcpu *vcpu,
 				 struct kvm_page_fault *fault)
 {
 	unsigned int foll = fault->write ? FOLL_WRITE : 0;
+	int userfault;
+
+	userfault = kvm_gfn_userfault(vcpu->kvm, fault->slot, fault->gfn);
+	if (userfault < 0)
+		return userfault;
+	if (userfault) {
+		kvm_mmu_prepare_userfault_exit(vcpu, fault);
+		return -EFAULT;
+	}
+
+	if (kvm_memslot_userfault(fault->slot))
+		fault->max_level = PG_LEVEL_4K;
 
 	if (fault->is_private)
 		return kvm_mmu_faultin_pfn_private(vcpu, fault);
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index b00abbe3f6cf..15705faa3b67 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -282,12 +282,26 @@ enum {
 	RET_PF_SPURIOUS,
 };
 
-static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
-						     struct kvm_page_fault *fault)
+static inline void __kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+						       struct kvm_page_fault *fault,
+						       bool is_userfault)
 {
 	kvm_prepare_memory_fault_exit(vcpu, fault->gfn << PAGE_SHIFT,
 				      PAGE_SIZE, fault->write, fault->exec,
-				      fault->is_private);
+				      fault->is_private,
+				      is_userfault);
+}
+
+static inline void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
+						     struct kvm_page_fault *fault)
+{
+	__kvm_mmu_prepare_memory_fault_exit(vcpu, fault, false);
+}
+
+static inline void kvm_mmu_prepare_userfault_exit(struct kvm_vcpu *vcpu,
+						  struct kvm_page_fault *fault)
+{
+	__kvm_mmu_prepare_memory_fault_exit(vcpu, fault, true);
 }
 
 static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b04092ec76a..2abb425a6514 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -13053,12 +13053,36 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	u32 new_flags = new ? new->flags : 0;
 	bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
 
+	/*
+	 * When toggling KVM Userfault on, zap all sptes so that userfault-ness
+	 * will be respected at refault time. All new faults will only install
+	 * small sptes. Therefore, when toggling it off, recover hugepages.
+	 *
+	 * For MOVE and DELETE, there will be nothing to do, as the old
+	 * mappings will have already been deleted by
+	 * kvm_arch_flush_shadow_memslot().
+	 *
+	 * For CREATE, no mappings will have been created yet.
+	 */
+	if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
+	    (change == KVM_MR_FLAGS_ONLY)) {
+		if (old_flags & KVM_MEM_USERFAULT)
+			kvm_mmu_recover_huge_pages(kvm, new);
+		else
+			kvm_arch_flush_shadow_memslot(kvm, old);
+	}
+
+	/*
+	 * Nothing more to do if dirty logging isn't being toggled.
+	 */
+	if (!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES))
+		return;
+
 	/*
 	 * Update CPU dirty logging if dirty logging is being toggled.  This
 	 * applies to all operations.
 	 */
-	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
-		kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
+	kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
 
 	/*
 	 * Nothing more to do for RO slots (which can't be dirtied and can't be
@@ -13078,14 +13102,6 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
 		return;
 
-	/*
-	 * READONLY and non-flags changes were filtered out above, and the only
-	 * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
-	 * logging isn't being toggled on or off.
-	 */
-	if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
-		return;
-
 	if (!log_dirty_pages) {
 		/*
 		 * Recover huge page mappings in the slot now that dirty logging
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f7a3dfd5e224..9e8a8dcf2b73 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2465,7 +2465,8 @@ static inline void kvm_account_pgtable_pages(void *virt, int nr)
 static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 						 gpa_t gpa, gpa_t size,
 						 bool is_write, bool is_exec,
-						 bool is_private)
+						 bool is_private,
+						 bool is_userfault)
 {
 	vcpu->run->exit_reason = KVM_EXIT_MEMORY_FAULT;
 	vcpu->run->memory_fault.gpa = gpa;
@@ -2475,6 +2476,8 @@ static inline void kvm_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
 	vcpu->run->memory_fault.flags = 0;
 	if (is_private)
 		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_PRIVATE;
+	if (is_userfault)
+		vcpu->run->memory_fault.flags |= KVM_MEMORY_EXIT_FLAG_USERFAULT;
 }
 
 #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
-- 
2.47.1.613.gc27f4b7a9f-goog
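
For reference, a minimal userspace-side sketch of consuming the new exit
information (illustrative only; KVM_MEMORY_EXIT_FLAG_USERFAULT comes from
this series' uapi changes, and how the fault is ultimately resolved depends
on the rest of the series):

#include <stdint.h>
#include <linux/kvm.h>

static int handle_userfault_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_MEMORY_FAULT ||
	    !(run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_USERFAULT))
		return -1;	/* Not a userfault exit; let other handlers run. */

	uint64_t gpa = run->memory_fault.gpa;	/* Faulting guest physical address. */
	uint64_t size = run->memory_fault.size;	/* PAGE_SIZE per this patch. */

	/*
	 * Resolve the fault (e.g. fetch the page contents during post-copy),
	 * clear the userfault state for this gfn, then re-enter with KVM_RUN.
	 */
	(void)gpa;
	(void)size;
	return 0;
}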
Re: [PATCH v2 05/13] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT
Posted by Sean Christopherson 9 months, 1 week ago
On Thu, Jan 09, 2025, James Houghton wrote:
> Adhering to the requirements of KVM Userfault:
> 
> 1. Zap all sptes for the memslot when KVM_MEM_USERFAULT is toggled on
>    with kvm_arch_flush_shadow_memslot().
> 2. Only allow PAGE_SIZE sptes when KVM_MEM_USERFAULT is enabled (for both
>    normal/GUP memory and guest_memfd memory).
> 3. Reconstruct huge mappings when KVM_MEM_USERFAULT is toggled off with
>    kvm_mmu_recover_huge_pages(). This is the behavior when dirty logging
>    is disabled; remain consistent with it.
> 
> With the new logic in kvm_mmu_slot_apply_flags(), I've simplified the
> two dirty-logging-toggle checks into one, and I have dropped the
> WARN_ON() that was there.
> 
> Signed-off-by: James Houghton <jthoughton@google.com>
> ---
>  arch/x86/kvm/Kconfig            |  1 +
>  arch/x86/kvm/mmu/mmu.c          | 27 +++++++++++++++++++++----
>  arch/x86/kvm/mmu/mmu_internal.h | 20 +++++++++++++++---
>  arch/x86/kvm/x86.c              | 36 ++++++++++++++++++++++++---------
>  include/linux/kvm_host.h        |  5 ++++-
>  5 files changed, 71 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
> index ea2c4f21c1ca..286c6825cd1c 100644
> --- a/arch/x86/kvm/Kconfig
> +++ b/arch/x86/kvm/Kconfig
> @@ -47,6 +47,7 @@ config KVM_X86
>  	select KVM_GENERIC_PRE_FAULT_MEMORY
>  	select KVM_GENERIC_PRIVATE_MEM if KVM_SW_PROTECTED_VM
>  	select KVM_WERROR if WERROR
> +	select HAVE_KVM_USERFAULT
>  
>  config KVM
>  	tristate "Kernel-based Virtual Machine (KVM) support"
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 2401606db260..5cab2785b97f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -4280,14 +4280,19 @@ static inline u8 kvm_max_level_for_order(int order)
>  	return PG_LEVEL_4K;
>  }
>  
> -static u8 kvm_max_private_mapping_level(struct kvm *kvm, kvm_pfn_t pfn,
> -					u8 max_level, int gmem_order)
> +static u8 kvm_max_private_mapping_level(struct kvm *kvm,
> +					struct kvm_memory_slot *slot,
> +					kvm_pfn_t pfn, u8 max_level,
> +					int gmem_order)
>  {
>  	u8 req_max_level;
>  
>  	if (max_level == PG_LEVEL_4K)
>  		return PG_LEVEL_4K;
>  
> +	if (kvm_memslot_userfault(slot))

Unless I'm missing something, this can go in kvm_mmu_hugepage_adjust():

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a4439e9e0726..49eb6b9b268c 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3304,7 +3304,7 @@ void kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
        if (is_error_noslot_pfn(fault->pfn))
                return;
 
-       if (kvm_slot_dirty_track_enabled(slot))
+       if (kvm_slot_dirty_track_enabled(slot) || kvm_is_userfault_memslot(slot))
                return;
 
>  static inline int kvm_mmu_do_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1b04092ec76a..2abb425a6514 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -13053,12 +13053,36 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
>  	u32 new_flags = new ? new->flags : 0;
>  	bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
>  
> +	/*
> +	 * When toggling KVM Userfault on, zap all sptes so that userfault-ness
> +	 * will be respected at refault time. All new faults will only install
> +	 * small sptes. Therefore, when toggling it off, recover hugepages.
> +	 *
> +	 * For MOVE and DELETE, there will be nothing to do, as the old
> +	 * mappings will have already been deleted by
> +	 * kvm_arch_flush_shadow_memslot().
> +	 *
> +	 * For CREATE, no mappings will have been created yet.
> +	 */

Eh, trim this down and reference the comment below to explain why FLAGS_ONLY
is the only case that needs to be handled.

> +	if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
> +	    (change == KVM_MR_FLAGS_ONLY)) {
> +		if (old_flags & KVM_MEM_USERFAULT)
> +			kvm_mmu_recover_huge_pages(kvm, new);
> +		else
> +			kvm_arch_flush_shadow_memslot(kvm, old);

The call to kvm_arch_flush_shadow_memslot() should definitely go in common code.
The fancy recovery logic is arch specific, but blasting the memslot when userfault
is toggled on is not.
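
A rough sketch of what that could look like (untested; the helper name and
exact call site in virt/kvm/kvm_main.c are illustrative):

/*
 * Sketch: the mandatory invalidation on the 0 => 1 USERFAULT transition
 * lives in generic code; optional huge page recovery on 1 => 0 stays
 * arch-specific.
 */
static void kvm_userfault_toggled(struct kvm *kvm,
				  struct kvm_memory_slot *old,
				  struct kvm_memory_slot *new,
				  enum kvm_mr_change change)
{
	u32 old_flags = old ? old->flags : 0;
	u32 new_flags = new ? new->flags : 0;

	if (change != KVM_MR_FLAGS_ONLY ||
	    !((old_flags ^ new_flags) & KVM_MEM_USERFAULT))
		return;

	/* Removing access when the flag is being set is mandatory. */
	if (new_flags & KVM_MEM_USERFAULT)
		kvm_arch_flush_shadow_memslot(kvm, old);
}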
Re: [PATCH v2 05/13] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT
Posted by Oliver Upton 8 months, 2 weeks ago
On Tue, May 06, 2025 at 05:05:50PM -0700, Sean Christopherson wrote:
> > +	if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
> > +	    (change == KVM_MR_FLAGS_ONLY)) {
> > +		if (old_flags & KVM_MEM_USERFAULT)
> > +			kvm_mmu_recover_huge_pages(kvm, new);
> > +		else
> > +			kvm_arch_flush_shadow_memslot(kvm, old);
> 
> The call to kvm_arch_flush_shadow_memslot() should definitely go in common code.
> The fancy recovery logic is arch specific, but blasting the memslot when userfault
> is toggled on is not.

Not like anything in KVM is consistent but sprinkling translation
changes / invalidations between arch and generic code feels
error-prone. Especially if there isn't clear ownership of a particular
flag, e.g. 0 -> 1 transitions happen in generic code and 1 -> 0 happens
in arch code.

Even in the case of KVM_MEM_USERFAULT, an architecture could potentially
preserve the stage-2 translations but reap access permissions without
modifying page tables / TLBs.

I'm happy with arch interfaces that clearly express intent (make this
memslot inaccessible), then the architecture can make an informed
decision about how to best achieve that. Otherwise we're always going to
use the largest possible hammer and potentially overinvalidate.

Thanks,
Oliver
Re: [PATCH v2 05/13] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT
Posted by Sean Christopherson 8 months, 2 weeks ago
On Wed, May 28, 2025, Oliver Upton wrote:
> On Tue, May 06, 2025 at 05:05:50PM -0700, Sean Christopherson wrote:
> > > +	if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
> > > +	    (change == KVM_MR_FLAGS_ONLY)) {
> > > +		if (old_flags & KVM_MEM_USERFAULT)
> > > +			kvm_mmu_recover_huge_pages(kvm, new);
> > > +		else
> > > +			kvm_arch_flush_shadow_memslot(kvm, old);
> > 
> > The call to kvm_arch_flush_shadow_memslot() should definitely go in common code.
> > The fancy recovery logic is arch specific, but blasting the memslot when userfault
> > is toggled on is not.
> 
> Not like anything in KVM is consistent but sprinkling translation
> changes / invalidations between arch and generic code feels
> error-prone.

Eh, leaving critical operations to arch code isn't exactly error free either :-)

> Especially if there isn't clear ownership of a particular flag, e.g. 0 -> 1
> transitions happen in generic code and 1 -> 0 happens in arch code.

The difference I see is that removing access to the memslot on 0=>1 is mandatory,
whereas any action on 1=>0 is not.  So IMO it's not arbitrary sprinkling of
invalidations, it's deliberately putting the common, mandatory logic in generic
code, while leaving optional performance tweaks to arch code.

> Even in the case of KVM_MEM_USERFAULT, an architecture could potentially
> preserve the stage-2 translations but reap access permissions without
> modifying page tables / TLBs.

Yes, but that wouldn't be strictly unique to KVM_MEM_USERFAULT.

E.g. for NUMA balancing faults (or rather, the PROT_NONE conversions), KVM could
handle the mmu_notifier invalidations by removing access while keeping the PTEs,
so that faulting the memory back would be a lighter weight operation.  Ditto for
reacting to other protection changes that come through mmu_notifiers.

If we want to go down that general path, my preference would be to put the control
logic in generic code, and then call dedicated arch APIs for removing protections.

> I'm happy with arch interfaces that clearly express intent (make this
> memslot inaccessible), then the architecture can make an informed
> decision about how to best achieve that. Otherwise we're always going to
> use the largest possible hammer and potentially overinvalidate.

Yeah, definitely no argument there given x86's history in this area.  Though if
we want to tackle that problem straightaway, I think I'd vote to add the
aforementioned dedicated APIs for removing protections, with a generic default
implementation that simply invokes kvm_arch_flush_shadow_memslot().
Re: [PATCH v2 05/13] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT
Posted by Sean Christopherson 8 months, 2 weeks ago
On Wed, May 28, 2025, Sean Christopherson wrote:
> On Wed, May 28, 2025, Oliver Upton wrote:
> > On Tue, May 06, 2025 at 05:05:50PM -0700, Sean Christopherson wrote:
> > > > +	if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
> > > > +	    (change == KVM_MR_FLAGS_ONLY)) {
> > > > +		if (old_flags & KVM_MEM_USERFAULT)
> > > > +			kvm_mmu_recover_huge_pages(kvm, new);
> > > > +		else
> > > > +			kvm_arch_flush_shadow_memslot(kvm, old);
> > > 
> > > The call to kvm_arch_flush_shadow_memslot() should definitely go in common code.
> > > The fancy recovery logic is arch specific, but blasting the memslot when userfault
> > > is toggled on is not.
> > 
> > Not like anything in KVM is consistent but sprinkling translation
> > changes / invalidations between arch and generic code feels
> > error-prone.
> 
> Eh, leaving critical operations to arch code isn't exactly error free either :-)
> 
> > Especially if there isn't clear ownership of a particular flag, e.g. 0 -> 1
> > transitions happen in generic code and 1 -> 0 happens in arch code.
> 
> The difference I see is that removing access to the memslot on 0=>1 is mandatory,
> whereas any action on 1=>0 is not.  So IMO it's not arbitrary sprinkling of
> invalidations, it's deliberately putting the common, mandatory logic in generic
> code, while leaving optional performance tweaks to arch code.
> 
> > Even in the case of KVM_MEM_USERFAULT, an architecture could potentially
> > preserve the stage-2 translations but reap access permissions without
> > modifying page tables / TLBs.
> 
> Yes, but that wouldn't be strictly unique to KVM_MEM_USERFAULT.
> 
> E.g. for NUMA balancing faults (or rather, the PROT_NONE conversions), KVM could
> handle the mmu_notifier invalidations by removing access while keeping the PTEs,
> so that faulting the memory back would be a lighter weight operation.  Ditto for
> reacting to other protection changes that come through mmu_notifiers.
> 
> If we want to go down that general path, my preference would be to put the control
> logic in generic code, and then call dedicated arch APIs for removing protections.
> 
> > I'm happy with arch interfaces that clearly express intent (make this
> > memslot inaccessible), then the architecture can make an informed
> > decision about how to best achieve that. Otherwise we're always going to
> > use the largest possible hammer and potentially overinvalidate.
> 
> Yeah, definitely no argument there given x86's history in this area.  Though if
> we want to tackle that problem straightaway, I think I'd vote to add the
> aforementioned dedicated APIs for removing protections, with a generic default
> implementation that simply invokes kvm_arch_flush_shadow_memslot().

Alternatively, we could punt on this issue entirely by not allowing userspace to
set KVM_MEM_USERFAULT on anything but KVM_MR_CREATE.  I.e. allow a FLAGS_ONLY
update to clear USERFAULT, but not set USERFAULT.
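
The check itself would be trivial in generic code, e.g. (untested sketch;
the helper name is made up):

/* Sketch: allow USERFAULT to be cleared after creation, but only set at
 * KVM_MR_CREATE.
 */
static int kvm_check_userfault_flags(enum kvm_mr_change change,
				     u32 old_flags, u32 new_flags)
{
	/* Setting USERFAULT is only allowed when the memslot is created... */
	if ((new_flags & ~old_flags) & KVM_MEM_USERFAULT &&
	    change != KVM_MR_CREATE)
		return -EINVAL;

	/* ...while clearing it via a FLAGS_ONLY update remains allowed. */
	return 0;
}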

Other than emulating poisoned pages, is there a (potential) use case for setting
KVM_MEM_USERFAULT after a VM has been created?
Re: [PATCH v2 05/13] KVM: x86/mmu: Add support for KVM_MEM_USERFAULT
Posted by James Houghton 8 months, 2 weeks ago
On Thu, May 29, 2025 at 10:56 AM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, May 28, 2025, Sean Christopherson wrote:
> > On Wed, May 28, 2025, Oliver Upton wrote:
> > > On Tue, May 06, 2025 at 05:05:50PM -0700, Sean Christopherson wrote:
> > > > > +       if ((old_flags ^ new_flags) & KVM_MEM_USERFAULT &&
> > > > > +           (change == KVM_MR_FLAGS_ONLY)) {
> > > > > +               if (old_flags & KVM_MEM_USERFAULT)
> > > > > +                       kvm_mmu_recover_huge_pages(kvm, new);
> > > > > +               else
> > > > > +                       kvm_arch_flush_shadow_memslot(kvm, old);
> > > >
> > > > The call to kvm_arch_flush_shadow_memslot() should definitely go in common code.
> > > > The fancy recovery logic is arch specific, but blasting the memslot when userfault
> > > > is toggled on is not.
> > >
> > > Not like anything in KVM is consistent but sprinkling translation
> > > changes / invalidations between arch and generic code feels
> > > error-prone.
> >
> > Eh, leaving critical operations to arch code isn't exactly error free either :-)
> >
> > > Especially if there isn't clear ownership of a particular flag, e.g. 0 -> 1
> > > transitions happen in generic code and 1 -> 0 happens in arch code.
> >
> > The difference I see is that removing access to the memslot on 0=>1 is mandatory,
> > whereas any action on 1=>0 is not.  So IMO it's not arbitrary sprinkling of
> > invalidations, it's deliberately putting the common, mandatory logic in generic
> > code, while leaving optional performance tweaks to arch code.
> >
> > > Even in the case of KVM_MEM_USERFAULT, an architecture could potentially
> > > preserve the stage-2 translations but reap access permissions without
> > > modifying page tables / TLBs.
> >
> > Yes, but that wouldn't be strictly unique to KVM_MEM_USERFAULT.
> >
> > E.g. for NUMA balancing faults (or rather, the PROT_NONE conversions), KVM could
> > handle the mmu_notifier invalidations by removing access while keeping the PTEs,
> > so that faulting the memory back would be a lighter weight operation.  Ditto for
> > reacting to other protection changes that come through mmu_notifiers.
> >
> > If we want to go down that general path, my preference would be to put the control
> > logic in generic code, and then call dedicated arch APIs for removing protections.
> >
> > > I'm happy with arch interfaces that clearly express intent (make this
> > > memslot inaccessible), then the architecture can make an informed
> > > decision about how to best achieve that. Otherwise we're always going to
> > > use the largest possible hammer and potentially overinvalidate.
> >
> > Yeah, definitely no argument there given x86's history in this area.  Though if
> > we want to tackle that problem straightaway, I think I'd vote to add the
> > aforementioned dedicated APIs for removing protections, with a generic default
> > implementation that simply invokes kvm_arch_flush_shadow_memslot().

I'm happy to add something like kvm_arch_invalidate_shadow_memslot()
which invokes kvm_arch_flush_shadow_memslot() by default (and has a
lockdep assertion for holding the slots lock), with no architectures
currently providing a specialization. Feel free to suggest better
names.
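
Roughly (untested sketch; the name is just my suggestion):

/* Default implementation in virt/kvm/kvm_main.c; architectures that can
 * do something lighter than a full zap may override it.
 */
void __weak kvm_arch_invalidate_shadow_memslot(struct kvm *kvm,
					       struct kvm_memory_slot *slot)
{
	lockdep_assert_held(&kvm->slots_lock);

	/* The largest hammer; arch code can substitute something lighter. */
	kvm_arch_flush_shadow_memslot(kvm, slot);
}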

Or we could add kvm_arch_userfault_changed(/* ... */, bool enabled),
whose default implementation calls kvm_arch_invalidate_shadow_memslot()
when `enabled == true` and does nothing otherwise. Then x86 can
specialize this. This arguably still leaves the responsibility of
unmapping/invalidating everything to arch code...

Let me know your preferences, Sean and Oliver.

>
> Alternatively, we could punt on this issue entirely by not allowing userspace to
> set KVM_MEM_USERFAULT on anything but KVM_MR_CREATE.  I.e. allow a FLAGS_ONLY
> update to clear USERFAULT, but not set USERFAULT.
>
> Other than emulating poisoned pages, is there a (potential) use case for setting
> KVM_MEM_USERFAULT after a VM has been created?

Today, Google's userspace does not know, when creating memslots, that it
will need to enable KVM_MEM_USERFAULT. We could delete and re-add the
memslots, of course, but overall, for any userspace, I think adding
this restriction (for what seems to be a non-issue :)) isn't worth it.

Thanks!