[PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn

James Houghton posted 11 patches 1 year, 3 months ago
There is a newer version of this series
[PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by James Houghton 1 year, 3 months ago
Walk the TDP MMU in an RCU read-side critical section without holding
mmu_lock when harvesting and potentially updating age information on
sptes. This requires a way to do RCU-safe walking of the tdp_mmu_roots;
do this with a new macro. The PTE modifications are now done atomically,
and kvm_tdp_mmu_spte_need_atomic_write() has been updated to account for
the fact that kvm_age_gfn can now locklessly update the accessed bit and
the W/R/X bits.

If the cmpxchg for marking the spte for access tracking fails, leave it
as is and treat it as if it were young; if the spte is being actively
modified, it is most likely young.

Harvesting age information from the shadow MMU is still done while
holding the MMU write lock.

Suggested-by: Yu Zhao <yuzhao@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Reviewed-by: David Matlack <dmatlack@google.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/Kconfig            |  1 +
 arch/x86/kvm/mmu/mmu.c          | 10 ++++++++--
 arch/x86/kvm/mmu/tdp_iter.h     | 12 ++++++------
 arch/x86/kvm/mmu/tdp_mmu.c      | 23 ++++++++++++++++-------
 5 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 70c7ed0ef184..84ee08078686 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1455,6 +1455,7 @@ struct kvm_arch {
 	 * tdp_mmu_page set.
 	 *
 	 * For reads, this list is protected by:
+	 *	RCU alone or
 	 *	the MMU lock in read mode + RCU or
 	 *	the MMU lock in write mode
 	 *
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 1ed1e4f5d51c..97f747d60fe9 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM_X86
 	select KVM_COMMON
 	select KVM_GENERIC_MMU_NOTIFIER
 	select KVM_ELIDE_TLB_FLUSH_IF_YOUNG
+	select KVM_MMU_NOTIFIER_YOUNG_LOCKLESS
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_PFNCACHE
 	select HAVE_KVM_DIRTY_RING_TSO
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 443845bb2e01..26797ccd34d8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1586,8 +1586,11 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (kvm_memslots_have_rmaps(kvm))
+	if (kvm_memslots_have_rmaps(kvm)) {
+		write_lock(&kvm->mmu_lock);
 		young = kvm_rmap_age_gfn_range(kvm, range, false);
+		write_unlock(&kvm->mmu_lock);
+	}
 
 	if (tdp_mmu_enabled)
 		young |= kvm_tdp_mmu_age_gfn_range(kvm, range);
@@ -1599,8 +1602,11 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
 {
 	bool young = false;
 
-	if (kvm_memslots_have_rmaps(kvm))
+	if (kvm_memslots_have_rmaps(kvm)) {
+		write_lock(&kvm->mmu_lock);
 		young = kvm_rmap_age_gfn_range(kvm, range, true);
+		write_unlock(&kvm->mmu_lock);
+	}
 
 	if (tdp_mmu_enabled)
 		young |= kvm_tdp_mmu_test_age_gfn(kvm, range);
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index a24fca3f9e7f..f26d0b60d2dd 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -39,10 +39,11 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 }
 
 /*
- * SPTEs must be modified atomically if they are shadow-present, leaf
- * SPTEs, and have volatile bits, i.e. has bits that can be set outside
- * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
- * handler, and Accessed and Dirty bits can be set by the CPU.
+ * SPTEs must be modified atomically if they have bits that can be set outside
+ * of the mmu_lock. This can happen for any shadow-present leaf SPTEs, as the
+ * Writable bit can be set by KVM's fast page fault handler, the Accessed and
+ * Dirty bits can be set by the CPU, and the Accessed and W/R/X bits can be
+ * cleared by age_gfn_range().
  *
  * Note, non-leaf SPTEs do have Accessed bits and those bits are
  * technically volatile, but KVM doesn't consume the Accessed bit of
@@ -53,8 +54,7 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
 static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
 {
 	return is_shadow_present_pte(old_spte) &&
-	       is_last_spte(old_spte, level) &&
-	       spte_has_volatile_bits(old_spte);
+	       is_last_spte(old_spte, level);
 }
 
 static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 4508d868f1cd..f5b4f1060fff 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -178,6 +178,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
 		     ((_only_valid) && (_root)->role.invalid))) {		\
 		} else
 
+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ */
+#define for_each_valid_tdp_mmu_root_rcu(_kvm, _root, _as_id)			\
+	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
+		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
+		    (_root)->role.invalid) {					\
+		} else
+
 #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
 	__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
 
@@ -1168,16 +1177,16 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
 	u64 new_spte;
 
 	if (spte_ad_enabled(iter->old_spte)) {
-		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
-							 iter->old_spte,
-							 shadow_accessed_mask,
-							 iter->level);
+		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
+						shadow_accessed_mask);
 		new_spte = iter->old_spte & ~shadow_accessed_mask;
 	} else {
 		new_spte = mark_spte_for_access_track(iter->old_spte);
-		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
-							iter->old_spte, new_spte,
-							iter->level);
+		/*
+		 * It is safe for the following cmpxchg to fail. Leave the
+		 * Accessed bit set, as the spte is most likely young anyway.
+		 */
+		(void)__tdp_mmu_set_spte_atomic(iter, new_spte);
 	}
 
 	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
-- 
2.47.0.199.ga7371fff76-goog
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by James Houghton 1 year ago
On Tue, Nov 5, 2024 at 10:43 AM James Houghton <jthoughton@google.com> wrote:
>
>  static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 4508d868f1cd..f5b4f1060fff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -178,6 +178,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>                      ((_only_valid) && (_root)->role.invalid))) {               \
>                 } else
>
> +/*
> + * Iterate over all TDP MMU roots in an RCU read-side critical section.
> + */
> +#define for_each_valid_tdp_mmu_root_rcu(_kvm, _root, _as_id)                   \
> +       list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)         \
> +               if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||     \
> +                   (_root)->role.invalid) {                                    \
> +               } else
> +

Venkatesh noticed that this function is unused in this patch. This was
a mistake in the latest rebase. The diff should have been applied:

@@ -1192,15 +1206,15 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
        struct tdp_iter iter;
        bool ret = false;

+       guard(rcu)();
+
        /*
         * Don't support rescheduling, none of the MMU notifiers that funnel
         * into this helper allow blocking; it'd be dead, wasteful code.  Note,
         * this helper must NOT be used to unmap GFNs, as it processes only
         * valid roots!
         */
-       for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
-               guard(rcu)();
-
+       for_each_valid_tdp_mmu_root_rcu(kvm, root, range->slot->as_id) {
                tdp_root_for_each_leaf_pte(iter, root, range->start,
range->end) {
                        if (!is_accessed_spte(iter.old_spte))
                                continue;

This bug will show up as a LOCKDEP warning on CONFIG_PROVE_RCU_LIST=y
kernels, as list_for_each_entry_rcu() is called outside of an RCU
read-side critical section.
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by James Houghton 1 year ago
On Mon, Jan 27, 2025 at 11:57 AM James Houghton <jthoughton@google.com> wrote:
>
> On Tue, Nov 5, 2024 at 10:43 AM James Houghton <jthoughton@google.com> wrote:
> >
> >  static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 4508d868f1cd..f5b4f1060fff 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -178,6 +178,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
> >                      ((_only_valid) && (_root)->role.invalid))) {               \
> >                 } else
> >
> > +/*
> > + * Iterate over all TDP MMU roots in an RCU read-side critical section.
> > + */
> > +#define for_each_valid_tdp_mmu_root_rcu(_kvm, _root, _as_id)                   \
> > +       list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)         \
> > +               if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||     \
> > +                   (_root)->role.invalid) {                                    \
> > +               } else
> > +
>
> Venkatesh noticed that this function is unused in this patch. This was
> a mistake in the latest rebase. The diff should have been applied:
>
> @@ -1192,15 +1206,15 @@ static bool __kvm_tdp_mmu_age_gfn_range(struct kvm *kvm,
>         struct tdp_iter iter;
>         bool ret = false;
>
> +       guard(rcu)();
> +
>         /*
>          * Don't support rescheduling, none of the MMU notifiers that funnel
>          * into this helper allow blocking; it'd be dead, wasteful code.  Note,
>          * this helper must NOT be used to unmap GFNs, as it processes only
>          * valid roots!
>          */
> -       for_each_valid_tdp_mmu_root(kvm, root, range->slot->as_id) {
> -               guard(rcu)();
> -
> +       for_each_valid_tdp_mmu_root_rcu(kvm, root, range->slot->as_id) {
>                 tdp_root_for_each_leaf_pte(iter, root, range->start,
> range->end) {
>                         if (!is_accessed_spte(iter.old_spte))
>                                 continue;
>
> This bug will show up as a LOCKDEP warning on CONFIG_PROVE_RCU_LIST=y
> kernels, as list_for_each_entry_rcu() is called outside of an RCU
> read-side critical section.

There I go lying again. The LOCKDEP warning that will show up is the
lockdep_assert_held_write(mmu_lock) in
kvm_lockdep_assert_mmu_lock_held().

The RCU lockdep WARN would hit if I had used
for_each_valid_tdp_mmu_root_rcu() without moving the RCU lock
acquisition.
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by Sean Christopherson 1 year ago
On Tue, Nov 05, 2024, James Houghton wrote:
> diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> index a24fca3f9e7f..f26d0b60d2dd 100644
> --- a/arch/x86/kvm/mmu/tdp_iter.h
> +++ b/arch/x86/kvm/mmu/tdp_iter.h
> @@ -39,10 +39,11 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
>  }
>  
>  /*
> - * SPTEs must be modified atomically if they are shadow-present, leaf
> - * SPTEs, and have volatile bits, i.e. has bits that can be set outside
> - * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
> - * handler, and Accessed and Dirty bits can be set by the CPU.
> + * SPTEs must be modified atomically if they have bits that can be set outside
> + * of the mmu_lock. This can happen for any shadow-present leaf SPTEs, as the
> + * Writable bit can be set by KVM's fast page fault handler, the Accessed and
> + * Dirty bits can be set by the CPU, and the Accessed and W/R/X bits can be
> + * cleared by age_gfn_range().
>   *
>   * Note, non-leaf SPTEs do have Accessed bits and those bits are
>   * technically volatile, but KVM doesn't consume the Accessed bit of
> @@ -53,8 +54,7 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
>  static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
>  {
>  	return is_shadow_present_pte(old_spte) &&
> -	       is_last_spte(old_spte, level) &&
> -	       spte_has_volatile_bits(old_spte);
> +	       is_last_spte(old_spte, level);

I don't like this change on multiple fronts.  First and foremost, it results in
spte_has_volatile_bits() being wrong for the TDP MMU.  Second, the same logic
applies to the shadow MMU; the rmap lock prevents a use-after-free of the page
that owns the SPTE, but the zapping of the SPTE happens before the writer grabs
the rmap lock.

Lastly, I'm very, very tempted to say we should omit Accessed state from
spte_has_volatile_bits() and rename it to something like spte_needs_atomic_write().
KVM x86 no longer flushes TLBs on aging, so we're already committed to incorrectly
thinking a page is old in rare cases, for the sake of performance.  The odds of
KVM clobbering the Accessed bit are probably smaller than the odds of missing an
Accessed update due to a stale TLB entry.

Note, only the shadow_accessed_mask check can be removed.  KVM needs to ensure
access-tracked SPTEs are zapped properly, and dirty logging can't have false
negatives.

>  }
>  
>  static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index 4508d868f1cd..f5b4f1060fff 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -178,6 +178,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
>  		     ((_only_valid) && (_root)->role.invalid))) {		\
>  		} else
>  
> +/*
> + * Iterate over all TDP MMU roots in an RCU read-side critical section.

Heh, that's pretty darn obvious.  It would be far more helpful if the comment
explained the usage rules, e.g. what is safe (at a high level).

> + */
> +#define for_each_valid_tdp_mmu_root_rcu(_kvm, _root, _as_id)			\
> +	list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)		\
> +		if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||	\
> +		    (_root)->role.invalid) {					\
> +		} else
> +
>  #define for_each_tdp_mmu_root(_kvm, _root, _as_id)			\
>  	__for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
>  
> @@ -1168,16 +1177,16 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
>  	u64 new_spte;
>  
>  	if (spte_ad_enabled(iter->old_spte)) {
> -		iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
> -							 iter->old_spte,
> -							 shadow_accessed_mask,
> -							 iter->level);
> +		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
> +						shadow_accessed_mask);

Align, and let this poke past 80:

		iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
								shadow_accessed_mask);

>  		new_spte = iter->old_spte & ~shadow_accessed_mask;
>  	} else {
>  		new_spte = mark_spte_for_access_track(iter->old_spte);
> -		iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
> -							iter->old_spte, new_spte,
> -							iter->level);
> +		/*
> +		 * It is safe for the following cmpxchg to fail. Leave the
> +		 * Accessed bit set, as the spte is most likely young anyway.
> +		 */
> +		(void)__tdp_mmu_set_spte_atomic(iter, new_spte);

Just a reminder that this needs to be:

		if (__tdp_mmu_set_spte_atomic(iter, new_spte))
			return;

>  	}
>  
>  	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
> -- 
> 2.47.0.199.ga7371fff76-goog
>
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by James Houghton 1 year ago
On Fri, Jan 10, 2025 at 2:47 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Tue, Nov 05, 2024, James Houghton wrote:
> > diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
> > index a24fca3f9e7f..f26d0b60d2dd 100644
> > --- a/arch/x86/kvm/mmu/tdp_iter.h
> > +++ b/arch/x86/kvm/mmu/tdp_iter.h
> > @@ -39,10 +39,11 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
> >  }
> >
> >  /*
> > - * SPTEs must be modified atomically if they are shadow-present, leaf
> > - * SPTEs, and have volatile bits, i.e. has bits that can be set outside
> > - * of mmu_lock.  The Writable bit can be set by KVM's fast page fault
> > - * handler, and Accessed and Dirty bits can be set by the CPU.
> > + * SPTEs must be modified atomically if they have bits that can be set outside
> > + * of the mmu_lock. This can happen for any shadow-present leaf SPTEs, as the
> > + * Writable bit can be set by KVM's fast page fault handler, the Accessed and
> > + * Dirty bits can be set by the CPU, and the Accessed and W/R/X bits can be
> > + * cleared by age_gfn_range().
> >   *
> >   * Note, non-leaf SPTEs do have Accessed bits and those bits are
> >   * technically volatile, but KVM doesn't consume the Accessed bit of
> > @@ -53,8 +54,7 @@ static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
> >  static inline bool kvm_tdp_mmu_spte_need_atomic_write(u64 old_spte, int level)
> >  {
> >       return is_shadow_present_pte(old_spte) &&
> > -            is_last_spte(old_spte, level) &&
> > -            spte_has_volatile_bits(old_spte);
> > +            is_last_spte(old_spte, level);
>
> I don't like this change on multiple fronts.  First and foremost, it results in
> spte_has_volatile_bits() being wrong for the TDP MMU.  Second, the same logic
> applies to the shadow MMU; the rmap lock prevents a use-after-free of the page
> that owns the SPTE, but the zapping of the SPTE happens before the writer grabs
> the rmap lock.

Thanks Sean, yes I forgot about the shadow MMU case.

> Lastly, I'm very, very tempted to say we should omit Accessed state from
> spte_has_volatile_bits() and rename it to something like spte_needs_atomic_write().
> KVM x86 no longer flushes TLBs on aging, so we're already committed to incorrectly
> thinking a page is old in rare cases, for the sake of performance.  The odds of
> KVM clobbering the Accessed bit are probably smaller than the odds of missing an
> Accessed update due to a stale TLB entry.
>
> Note, only the shadow_accessed_mask check can be removed.  KVM needs to ensure
> access-tracked SPTEs are zapped properly, and dirty logging can't have false
> negatives.

I've dropped the change to kvm_tdp_mmu_spte_need_atomic_write() and
instead applied this diff.

--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -142,8 +142,14 @@ bool spte_has_volatile_bits(u64 spte)
                return true;

        if (spte_ad_enabled(spte)) {
-               if (!(spte & shadow_accessed_mask) ||
-                   (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+               /*
+                * Do not check the Accessed bit. It can be set (by the CPU)
+                * and cleared (by kvm_tdp_mmu_age_spte()) without holding
+                * the mmu_lock, but when clearing the Accessed bit, we do
+                * not invalidate the TLB, so we can already miss Accessed bit
+                * updates.
+                */
+               if (is_writable_pte(spte) && !(spte & shadow_dirty_mask))
                        return true;
        }

I've also included a new patch to rename spte_has_volatile_bits() to
spte_needs_atomic_write() like you suggested. I merely renamed it in
all locations, including documentation; I haven't reworded the
documentation's use of the word "volatile."

>
> >  }
> >
> >  static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
> > diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> > index 4508d868f1cd..f5b4f1060fff 100644
> > --- a/arch/x86/kvm/mmu/tdp_mmu.c
> > +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> > @@ -178,6 +178,15 @@ static struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm,
> >                    ((_only_valid) && (_root)->role.invalid))) {               \
> >               } else
> >
> > +/*
> > + * Iterate over all TDP MMU roots in an RCU read-side critical section.
>
> Heh, that's pretty darn obvious.  It would be far more helpful if the comment
> explained the usage rules, e.g. what is safe (at a high level).

How's this?

+/*
+ * Iterate over all TDP MMU roots in an RCU read-side critical section.
+ * It is safe to iterate over the SPTEs under the root, but their values will
+ * be unstable, so all writes must be atomic. As this routine is meant to be
+ * used without holding the mmu_lock at all, any bits that are flipped must
+ * be reflected in kvm_tdp_mmu_spte_need_atomic_write().
+ */

> > + */
> > +#define for_each_valid_tdp_mmu_root_rcu(_kvm, _root, _as_id)                 \
> > +     list_for_each_entry_rcu(_root, &_kvm->arch.tdp_mmu_roots, link)         \
> > +             if ((_as_id >= 0 && kvm_mmu_page_as_id(_root) != _as_id) ||     \
> > +                 (_root)->role.invalid) {                                    \
> > +             } else
> > +
> >  #define for_each_tdp_mmu_root(_kvm, _root, _as_id)                   \
> >       __for_each_tdp_mmu_root(_kvm, _root, _as_id, false)
> >
> > @@ -1168,16 +1177,16 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
> >       u64 new_spte;
> >
> >       if (spte_ad_enabled(iter->old_spte)) {
> > -             iter->old_spte = tdp_mmu_clear_spte_bits(iter->sptep,
> > -                                                      iter->old_spte,
> > -                                                      shadow_accessed_mask,
> > -                                                      iter->level);
> > +             iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
> > +                                             shadow_accessed_mask);
>
> Align, and let this poke past 80:
>
>                 iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
>                                                                 shadow_accessed_mask);

Done.

> >               new_spte = iter->old_spte & ~shadow_accessed_mask;
> >       } else {
> >               new_spte = mark_spte_for_access_track(iter->old_spte);
> > -             iter->old_spte = kvm_tdp_mmu_write_spte(iter->sptep,
> > -                                                     iter->old_spte, new_spte,
> > -                                                     iter->level);
> > +             /*
> > +              * It is safe for the following cmpxchg to fail. Leave the
> > +              * Accessed bit set, as the spte is most likely young anyway.
> > +              */
> > +             (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
>
> Just a reminder that this needs to be:
>
>                 if (__tdp_mmu_set_spte_atomic(iter, new_spte))
>                         return;
>

Already applied, thanks!


> >       }
> >
> >       trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
> > --
> > 2.47.0.199.ga7371fff76-goog
> >
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by kernel test robot 1 year, 3 months ago
Hi James,

kernel test robot noticed the following build warnings:

[auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]

url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
base:   a27e0515592ec9ca28e0d027f42568c47b314784
patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/

All warnings (new ones prefixed by >>):

   arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
>> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
    1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
         |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


vim +1189 arch/x86/kvm/mmu/tdp_mmu.c

  1166	
  1167	/*
  1168	 * Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
  1169	 * if any of the GFNs in the range have been accessed.
  1170	 *
  1171	 * No need to mark the corresponding PFN as accessed as this call is coming
  1172	 * from the clear_young() or clear_flush_young() notifier, which uses the
  1173	 * return value to determine if the page has been accessed.
  1174	 */
  1175	static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
  1176	{
  1177		u64 new_spte;
  1178	
  1179		if (spte_ad_enabled(iter->old_spte)) {
  1180			iter->old_spte = tdp_mmu_clear_spte_bits_atomic(iter->sptep,
  1181							shadow_accessed_mask);
  1182			new_spte = iter->old_spte & ~shadow_accessed_mask;
  1183		} else {
  1184			new_spte = mark_spte_for_access_track(iter->old_spte);
  1185			/*
  1186			 * It is safe for the following cmpxchg to fail. Leave the
  1187			 * Accessed bit set, as the spte is most likely young anyway.
  1188			 */
> 1189			(void)__tdp_mmu_set_spte_atomic(iter, new_spte);
  1190		}
  1191	
  1192		trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
  1193					       iter->old_spte, new_spte);
  1194	}
  1195	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by James Houghton 1 year, 3 months ago
On Wed, Nov 6, 2024 at 3:22 AM kernel test robot <lkp@intel.com> wrote:
>
> Hi James,
>
> kernel test robot noticed the following build warnings:
>
> [auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
> base:   a27e0515592ec9ca28e0d027f42568c47b314784
> patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
> patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
> config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
> compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/
>
> All warnings (new ones prefixed by >>):
>
>    arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
> >> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
>     1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
>          |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>

Well, I saw this compiler warning in my latest rebase and thought the
`(void)` would fix it. I guess the next best way to fix it would be to
assign to an `int __maybe_unused`. I'll do for a v9, or Sean if you're
going to take the series (maybe? :)), go ahead and apply whatever fix
you like.
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by Sean Christopherson 1 year, 3 months ago
On Thu, Nov 07, 2024, James Houghton wrote:
> On Wed, Nov 6, 2024 at 3:22 AM kernel test robot <lkp@intel.com> wrote:
> >
> > Hi James,
> >
> > kernel test robot noticed the following build warnings:
> >
> > [auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]
> >
> > url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
> > base:   a27e0515592ec9ca28e0d027f42568c47b314784
> > patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
> > patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
> > config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
> > compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)
> >
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/
> >
> > All warnings (new ones prefixed by >>):
> >
> >    arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
> > >> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
> >     1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
> >          |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> >
> 
> Well, I saw this compiler warning in my latest rebase and thought the
> `(void)` would fix it. I guess the next best way to fix it would be to
> assign to an `int __maybe_unused`. I'll do for a v9, or Sean if you're
> going to take the series (maybe? :)), go ahead and apply whatever fix
> you like.

Heh, actually, the compiler is correct.  Ignoring the return value is a bug.
KVM should instead return immediately, as falling through to the tracepoint will
log bogus information.  E.g. will show a !PRESENT SPTE, instead of whatever the
current SPTE actually is (iter->old_spte will have been updating to the current
value of the SPTE).

	trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
				       iter->old_spte, new_spte);

diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index f5b4f1060fff..cc8ae998b7c8 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1186,7 +1186,8 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
                 * It is safe for the following cmpxchg to fail. Leave the
                 * Accessed bit set, as the spte is most likely young anyway.
                 */
-               (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
+               if (__tdp_mmu_set_spte_atomic(iter, new_spte))
+                       return;
        }
 
        trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
Re: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
Posted by James Houghton 1 year, 2 months ago
On Fri, Nov 8, 2024 at 5:45 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Nov 07, 2024, James Houghton wrote:
> > On Wed, Nov 6, 2024 at 3:22 AM kernel test robot <lkp@intel.com> wrote:
> > >
> > > Hi James,
> > >
> > > kernel test robot noticed the following build warnings:
> > >
> > > [auto build test WARNING on a27e0515592ec9ca28e0d027f42568c47b314784]
> > >
> > > url:    https://github.com/intel-lab-lkp/linux/commits/James-Houghton/KVM-Remove-kvm_handle_hva_range-helper-functions/20241106-025133
> > > base:   a27e0515592ec9ca28e0d027f42568c47b314784
> > > patch link:    https://lore.kernel.org/r/20241105184333.2305744-5-jthoughton%40google.com
> > > patch subject: [PATCH v8 04/11] KVM: x86/mmu: Relax locking for kvm_test_age_gfn and kvm_age_gfn
> > > config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/config)
> > > compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
> > > reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241106/202411061526.RAuCXKJh-lkp@intel.com/reproduce)
> > >
> > > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > > the same patch/commit), kindly add following tags
> > > | Reported-by: kernel test robot <lkp@intel.com>
> > > | Closes: https://lore.kernel.org/oe-kbuild-all/202411061526.RAuCXKJh-lkp@intel.com/
> > >
> > > All warnings (new ones prefixed by >>):
> > >
> > >    arch/x86/kvm/mmu/tdp_mmu.c: In function 'kvm_tdp_mmu_age_spte':
> > > >> arch/x86/kvm/mmu/tdp_mmu.c:1189:23: warning: ignoring return value of '__tdp_mmu_set_spte_atomic' declared with attribute 'warn_unused_result' [-Wunused-result]
> > >     1189 |                 (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
> > >          |                       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> > >
> >
> > Well, I saw this compiler warning in my latest rebase and thought the
> > `(void)` would fix it. I guess the next best way to fix it would be to
> > assign to an `int __maybe_unused`. I'll do for a v9, or Sean if you're
> > going to take the series (maybe? :)), go ahead and apply whatever fix
> > you like.
>
> Heh, actually, the compiler is correct.  Ignoring the return value is a bug.
> KVM should instead return immediately, as falling through to the tracepoint will
> log bogus information.  E.g. will show a !PRESENT SPTE, instead of whatever the
> current SPTE actually is (iter->old_spte will have been updating to the current
> value of the SPTE).
>
>         trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
>                                        iter->old_spte, new_spte);
>
> diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
> index f5b4f1060fff..cc8ae998b7c8 100644
> --- a/arch/x86/kvm/mmu/tdp_mmu.c
> +++ b/arch/x86/kvm/mmu/tdp_mmu.c
> @@ -1186,7 +1186,8 @@ static void kvm_tdp_mmu_age_spte(struct tdp_iter *iter)
>                  * It is safe for the following cmpxchg to fail. Leave the
>                  * Accessed bit set, as the spte is most likely young anyway.
>                  */
> -               (void)__tdp_mmu_set_spte_atomic(iter, new_spte);
> +               if (__tdp_mmu_set_spte_atomic(iter, new_spte))
> +                       return;
>         }
>
>         trace_kvm_tdp_mmu_spte_changed(iter->as_id, iter->gfn, iter->level,
>

Oh yes, you're right. Thanks Sean! The diff LGTM.