During the initial mprotect(RO) stage of mmu_stress_test, keep vCPUs
spinning until all vCPUs have hit -EFAULT, i.e. until all vCPUs have tried
to write to a read-only page. If a vCPU manages to complete an entire
iteration of the loop without hitting a read-only page, *and* the vCPU
observes mprotect_ro_done before starting a second iteration, then the
vCPU will prematurely fall through to GUEST_SYNC(3) (on x86 and arm64) and
get out of sequence.
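For reference, the guest-side loop before this patch looks roughly like this (condensed from mmu_stress_test.c; on x86 and arm64 the store is actually emitted via inline assembly so that the faulting instruction can be skipped):

	/*
	 * Condensed pre-patch guest loop: the only exit condition is the
	 * host-written mprotect_ro_done flag.
	 */
	do {
		for (gpa = start_gpa; gpa < end_gpa; gpa += stride)
			vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
	} while (!READ_ONCE(mprotect_ro_done));

A vCPU that completes a full for-loop pass without touching an already read-only page, and then observes mprotect_ro_done == true, exits the loop without ever taking a fault and reaches GUEST_SYNC(3) before the other vCPUs have taken their -EFAULT exits, which is exactly the premature fall-through described above.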
Replace the "do-while (!r)" loop around the associated _vcpu_run() with
a single invocation, as barring a KVM bug, the vCPU is guaranteed to hit
-EFAULT, and retrying on success is super confusing, hides KVM bugs, and
complicates this fix. The do-while loop was semi-unintentionally added
specifically to fudge around a KVM x86 bug, and said bug is unhittable
without modifying the test to force x86 down the !(x86||arm64) path.
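For clarity, the host-side half of the fix in vcpu_worker() boils down to the following (condensed from the diff below), with the guest's do-while condition extended to also wait on the new all_vcpus_hit_ro_fault flag:

	/* Each vCPU takes exactly one -EFAULT exit during the RO stage... */
	r = _vcpu_run(vcpu);
	TEST_ASSERT(r == -1 && errno == EFAULT,
		    "Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);

	/* ...and the last vCPU to fault releases the guest-side spin loop. */
	atomic_inc(&nr_ro_faults);
	if (atomic_read(&nr_ro_faults) == nr_vcpus) {
		WRITE_ONCE(all_vcpus_hit_ro_fault, true);
		sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
	}

If two vCPUs both observe nr_ro_faults == nr_vcpus, setting and syncing the flag twice is harmless, as the value written is the same either way.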
On x86, if forced emulation is enabled, vcpu_arch_put_guest() may trigger
emulation of the store to memory. Due to a (very, very) longstanding bug in
KVM x86's emulator, emulated writes to guest memory that fail during
__kvm_write_guest_page() unconditionally return KVM_EXIT_MMIO. While that
is desirable in the !memslot case, it's wrong in this case as the failure
happens due to __copy_to_user() hitting a read-only page, not an emulated
MMIO region.
But as above, x86 only uses vcpu_arch_put_guest() if the __x86_64__ guards
are clobbered to force x86 down the common path, and of course the
unexpected MMIO is a KVM bug, i.e. *should* cause a test failure.
Reported-by: Yan Zhao <yan.y.zhao@intel.com>
Debugged-by: Yan Zhao <yan.y.zhao@intel.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
tools/testing/selftests/kvm/mmu_stress_test.c | 21 ++++++++++++-------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c
index d9c76b4c0d88..6a437d2be9fa 100644
--- a/tools/testing/selftests/kvm/mmu_stress_test.c
+++ b/tools/testing/selftests/kvm/mmu_stress_test.c
@@ -18,6 +18,7 @@
#include "ucall_common.h"
static bool mprotect_ro_done;
+static bool all_vcpus_hit_ro_fault;
static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
{
@@ -36,9 +37,9 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
/*
* Write to the region while mprotect(PROT_READ) is underway. Keep
- * looping until the memory is guaranteed to be read-only, otherwise
- * vCPUs may complete their writes and advance to the next stage
- * prematurely.
+ * looping until the memory is guaranteed to be read-only and a fault
+ * has occurred, otherwise vCPUs may complete their writes and advance
+ * to the next stage prematurely.
*
* For architectures that support skipping the faulting instruction,
* generate the store via inline assembly to ensure the exact length
@@ -56,7 +57,7 @@ static void guest_code(uint64_t start_gpa, uint64_t end_gpa, uint64_t stride)
#else
vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
#endif
- } while (!READ_ONCE(mprotect_ro_done));
+ } while (!READ_ONCE(mprotect_ro_done) || !READ_ONCE(all_vcpus_hit_ro_fault));
/*
* Only architectures that write the entire range can explicitly sync,
@@ -81,6 +82,7 @@ struct vcpu_info {
static int nr_vcpus;
static atomic_t rendezvous;
+static atomic_t nr_ro_faults;
static void rendezvous_with_boss(void)
{
@@ -148,12 +150,16 @@ static void *vcpu_worker(void *data)
* be stuck on the faulting instruction for other architectures. Go to
* stage 3 without a rendezvous
*/
- do {
- r = _vcpu_run(vcpu);
- } while (!r);
+ r = _vcpu_run(vcpu);
TEST_ASSERT(r == -1 && errno == EFAULT,
"Expected EFAULT on write to RO memory, got r = %d, errno = %d", r, errno);
+ atomic_inc(&nr_ro_faults);
+ if (atomic_read(&nr_ro_faults) == nr_vcpus) {
+ WRITE_ONCE(all_vcpus_hit_ro_fault, true);
+ sync_global_to_guest(vm, all_vcpus_hit_ro_fault);
+ }
+
#if defined(__x86_64__) || defined(__aarch64__)
/*
* Verify *all* writes from the guest hit EFAULT due to the VMA now
@@ -378,7 +384,6 @@ int main(int argc, char *argv[])
rendezvous_with_vcpus(&time_run2, "run 2");
mprotect(mem, slot_size, PROT_READ);
- usleep(10);
mprotect_ro_done = true;
sync_global_to_guest(vm, mprotect_ro_done);
base-commit: 557953f8b75fce49dc65f9b0f7e811c060fc7860
--
2.48.1.711.g2feabab25a-goog
On Fri, 28 Feb 2025 15:08:04 -0800, Sean Christopherson wrote:
> During the initial mprotect(RO) stage of mmu_stress_test, keep vCPUs
> spinning until all vCPUs have hit -EFAULT, i.e. until all vCPUs have tried
> to write to a read-only page. If a vCPU manages to complete an entire
> iteration of the loop without hitting a read-only page, *and* the vCPU
> observes mprotect_ro_done before starting a second iteration, then the
> vCPU will prematurely fall through to GUEST_SYNC(3) (on x86 and arm64) and
> get out of sequence.
>
> [...]
Applied to kvm-x86 fixes, with Fixes and Closes. Thanks!
[1/1] KVM: selftests: Ensure all vCPUs hit -EFAULT during initial RO stage
https://github.com/kvm-x86/linux/commit/d88ed5fb7c88
--
https://github.com/kvm-x86/linux/tree/next
On Fri, Feb 28, 2025 at 03:08:04PM -0800, Sean Christopherson wrote:
> During the initial mprotect(RO) stage of mmu_stress_test, keep vCPUs
> spinning until all vCPUs have hit -EFAULT, i.e. until all vCPUs have tried
> to write to a read-only page. If a vCPU manages to complete an entire
> iteration of the loop without hitting a read-only page, *and* the vCPU
> observes mprotect_ro_done before starting a second iteration, then the
> vCPU will prematurely fall through to GUEST_SYNC(3) (on x86 and arm64) and
> get out of sequence.
>
> [...]
>
> Reported-by: Yan Zhao <yan.y.zhao@intel.com>
> Debugged-by: Yan Zhao <yan.y.zhao@intel.com>
> Signed-off-by: Sean Christopherson <seanjc@google.com>
Nit: consider adding Closes and Fixes tags.
Reviewed-by: Yan Zhao <yan.y.zhao@intel.com>
Tested-by: Yan Zhao <yan.y.zhao@intel.com>