[PATCH 25/28] KVM: nSVM: Access MSRPM in 4-byte chunks only for merging L0 and L1 bitmaps

Posted by Sean Christopherson 8 months, 2 weeks ago
Access the MSRPM using u32/4-byte chunks (and appropriately adjusted
offsets) only when merging L0 and L1 bitmaps as part of emulating VMRUN.
The only reason to batch accesses to MSRPMs is to avoid the overhead of
uaccess operations (e.g. STAC/CLAC and bounds checks) when reading L1's
bitmap pointed at by vmcb12.  For all other uses, either per-bit accesses
are more than fast enough (no uaccess), or KVM is only accessing a single
bit (nested_svm_exit_handled_msr()) and so there's nothing to batch.

In addition to (hopefully) documenting the uniqueness of the merging code,
restricting chunked access to _just_ the merging code will allow for
increasing the chunk size (to unsigned long) with minimal risk.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/nested.c | 50 ++++++++++++++++-----------------------
 arch/x86/kvm/svm/svm.h    | 18 ++++++++++----
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index e07e10fb52a5..a4e98ada732b 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -187,31 +187,19 @@ void recalc_intercepts(struct vcpu_svm *svm)
 static int nested_svm_msrpm_merge_offsets[6] __ro_after_init;
 static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
 
-static const u32 msrpm_ranges[] = {
-	SVM_MSRPM_RANGE_0_BASE_MSR,
-	SVM_MSRPM_RANGE_1_BASE_MSR,
-	SVM_MSRPM_RANGE_2_BASE_MSR
-};
+#define SVM_BUILD_MSR_BYTE_NR_CASE(range_nr, msr)				\
+	case SVM_MSRPM_FIRST_MSR(range_nr) ... SVM_MSRPM_LAST_MSR(range_nr):	\
+		return SVM_MSRPM_BYTE_NR(range_nr, msr);
 
 static u32 svm_msrpm_offset(u32 msr)
 {
-	u32 offset;
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(msrpm_ranges); i++) {
-		if (msr < msrpm_ranges[i] ||
-		    msr >= msrpm_ranges[i] + SVM_MSRS_PER_RANGE)
-			continue;
-
-		offset  = (msr - msrpm_ranges[i]) / SVM_MSRS_PER_BYTE;
-		offset += (i * SVM_MSRPM_BYTES_PER_RANGE);  /* add range offset */
-
-		/* Now we have the u8 offset - but need the u32 offset */
-		return offset / 4;
+	switch (msr) {
+	SVM_BUILD_MSR_BYTE_NR_CASE(0, msr)
+	SVM_BUILD_MSR_BYTE_NR_CASE(1, msr)
+	SVM_BUILD_MSR_BYTE_NR_CASE(2, msr)
+	default:
+		return MSR_INVALID;
 	}
-
-	/* MSR not in any range */
-	return MSR_INVALID;
 }
 
 int __init nested_svm_init_msrpm_merge_offsets(void)
@@ -245,6 +233,12 @@ int __init nested_svm_init_msrpm_merge_offsets(void)
 		if (WARN_ON(offset == MSR_INVALID))
 			return -EIO;
 
+		/*
+		 * Merging is done in 32-bit chunks to reduce the number of
+		 * accesses to L1's bitmap.
+		 */
+		offset /= sizeof(u32);
+
 		for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
 			if (nested_svm_msrpm_merge_offsets[j] == offset)
 				break;
@@ -1363,8 +1357,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
 
 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 {
-	u32 offset, msr, value;
-	int write, mask;
+	u32 offset, msr;
+	int write;
+	u8 value;
 
 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
 		return NESTED_EXIT_HOST;
@@ -1372,18 +1367,15 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
 	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
 	offset = svm_msrpm_offset(msr);
 	write  = svm->vmcb->control.exit_info_1 & 1;
-	mask   = 1 << ((2 * (msr & 0xf)) + write);
 
 	if (offset == MSR_INVALID)
 		return NESTED_EXIT_DONE;
 
-	/* Offset is in 32 bit units but need in 8 bit units */
-	offset *= 4;
-
-	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
+	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset,
+				&value, sizeof(value)))
 		return NESTED_EXIT_DONE;
 
-	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
+	return (value & BIT(write)) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
 }
 
 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 77287c870967..155b6089fcd2 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -634,15 +634,23 @@ static_assert(SVM_MSRS_PER_RANGE == 8192);
 	(range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +			\
 	 (msr - SVM_MSRPM_RANGE_## range_nr ##_BASE_MSR) * SVM_BITS_PER_MSR)
 
-#define SVM_MSRPM_SANITY_CHECK_BITS(range_nr)					\
+#define SVM_MSRPM_BYTE_NR(range_nr, msr)					\
+	(range_nr * SVM_MSRPM_BYTES_PER_RANGE +					\
+	 (msr - SVM_MSRPM_RANGE_## range_nr ##_BASE_MSR) / SVM_MSRS_PER_BYTE)
+
+#define SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(range_nr)				\
 static_assert(SVM_MSRPM_BIT_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 1) ==	\
 	      range_nr * 2048 * 8 + 2);						\
 static_assert(SVM_MSRPM_BIT_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 7) ==	\
-	      range_nr * 2048 * 8 + 14);
+	      range_nr * 2048 * 8 + 14);					\
+static_assert(SVM_MSRPM_BYTE_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 1) ==	\
+	      range_nr * 2048);							\
+static_assert(SVM_MSRPM_BYTE_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 7) ==	\
+	      range_nr * 2048 + 1);
 
-SVM_MSRPM_SANITY_CHECK_BITS(0);
-SVM_MSRPM_SANITY_CHECK_BITS(1);
-SVM_MSRPM_SANITY_CHECK_BITS(2);
+SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(0);
+SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(1);
+SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(2);
 
 #define SVM_BUILD_MSR_BITMAP_CASE(bitmap, range_nr, msr, bitop, bit_rw)		\
 	case SVM_MSRPM_FIRST_MSR(range_nr) ... SVM_MSRPM_LAST_MSR(range_nr):	\
-- 
2.49.0.1204.g71687c7c1d-goog
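
As a rough sketch of the batching the changelog describes (this is not the
in-tree nested_svm_vmrun_msrpm(); the msrpm01/msrpm02 parameter names are
placeholders), the merge loop conceptually looks like the following, with
each guest read pulling in a full u32, i.e. 16 MSRs' worth of intercept
bits, from L1's bitmap instead of doing a uaccess per bit:

/*
 * Conceptual sketch only: the precomputed merge offsets are u32 indices,
 * so every kvm_vcpu_read_guest() below batches 16 MSRs' worth of
 * intercept bits from L1's bitmap in a single access.
 */
static int example_merge_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm02,
			       const u32 *msrpm01, gpa_t vmcb12_msrpm)
{
	int i;

	for (i = 0; i < nested_svm_nr_msrpm_merge_offsets; i++) {
		const int p = nested_svm_msrpm_merge_offsets[i];
		u32 value;

		if (kvm_vcpu_read_guest(vcpu, vmcb12_msrpm + p * sizeof(u32),
					&value, sizeof(value)))
			return -EINVAL;

		/* Intercept the MSR if either L0 or L1 wants it intercepted. */
		msrpm02[p] = msrpm01[p] | value;
	}

	return 0;
}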
Re: [PATCH 25/28] KVM: nSVM: Access MSRPM in 4-byte chunks only for merging L0 and L1 bitmaps
Posted by Paolo Bonzini 8 months, 1 week ago
On 5/30/25 01:40, Sean Christopherson wrote:
> Access the MSRPM using u32/4-byte chunks (and appropriately adjusted
> offsets) only when merging L0 and L1 bitmaps as part of emulating VMRUN.
> The only reason to batch accesses to MSRPMs is to avoid the overhead of
> uaccess operations (e.g. STAC/CLAC and bounds checks) when reading L1's
> bitmap pointed at by vmcb12.  For all other uses, either per-bit accesses
> are more than fast enough (no uaccess), or KVM is only accessing a single
> bit (nested_svm_exit_handled_msr()) and so there's nothing to batch.
> 
> In addition to (hopefully) documenting the uniqueness of the merging code,
> restricting chunked access to _just_ the merging code will allow for
> increasing the chunk size (to unsigned long) with minimal risk.
> 
> Signed-off-by: Sean Christopherson <seanjc@google.com>
> ---
>   arch/x86/kvm/svm/nested.c | 50 ++++++++++++++++-----------------------
>   arch/x86/kvm/svm/svm.h    | 18 ++++++++++----
>   2 files changed, 34 insertions(+), 34 deletions(-)
> 
> diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
> index e07e10fb52a5..a4e98ada732b 100644
> --- a/arch/x86/kvm/svm/nested.c
> +++ b/arch/x86/kvm/svm/nested.c
> @@ -187,31 +187,19 @@ void recalc_intercepts(struct vcpu_svm *svm)
>   static int nested_svm_msrpm_merge_offsets[6] __ro_after_init;
>   static int nested_svm_nr_msrpm_merge_offsets __ro_after_init;
>   
> -static const u32 msrpm_ranges[] = {
> -	SVM_MSRPM_RANGE_0_BASE_MSR,
> -	SVM_MSRPM_RANGE_1_BASE_MSR,
> -	SVM_MSRPM_RANGE_2_BASE_MSR
> -};
> +#define SVM_BUILD_MSR_BYTE_NR_CASE(range_nr, msr)				\
> +	case SVM_MSRPM_FIRST_MSR(range_nr) ... SVM_MSRPM_LAST_MSR(range_nr):	\
> +		return SVM_MSRPM_BYTE_NR(range_nr, msr);
>   
>   static u32 svm_msrpm_offset(u32 msr)
>   {
> -	u32 offset;
> -	int i;
> -
> -	for (i = 0; i < ARRAY_SIZE(msrpm_ranges); i++) {
> -		if (msr < msrpm_ranges[i] ||
> -		    msr >= msrpm_ranges[i] + SVM_MSRS_PER_RANGE)
> -			continue;
> -
> -		offset  = (msr - msrpm_ranges[i]) / SVM_MSRS_PER_BYTE;
> -		offset += (i * SVM_MSRPM_BYTES_PER_RANGE);  /* add range offset */
> -
> -		/* Now we have the u8 offset - but need the u32 offset */
> -		return offset / 4;
> +	switch (msr) {
> +	SVM_BUILD_MSR_BYTE_NR_CASE(0, msr)
> +	SVM_BUILD_MSR_BYTE_NR_CASE(1, msr)
> +	SVM_BUILD_MSR_BYTE_NR_CASE(2, msr)
> +	default:
> +		return MSR_INVALID;
>   	}
> -
> -	/* MSR not in any range */
> -	return MSR_INVALID;
>   }
>   
>   int __init nested_svm_init_msrpm_merge_offsets(void)
> @@ -245,6 +233,12 @@ int __init nested_svm_init_msrpm_merge_offsets(void)
>   		if (WARN_ON(offset == MSR_INVALID))
>   			return -EIO;
>   
> +		/*
> +		 * Merging is done in 32-bit chunks to reduce the number of
> +		 * accesses to L1's bitmap.
> +		 */
> +		offset /= sizeof(u32);
> +
>   		for (j = 0; j < nested_svm_nr_msrpm_merge_offsets; j++) {
>   			if (nested_svm_msrpm_merge_offsets[j] == offset)
>   				break;
> @@ -1363,8 +1357,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
>   
>   static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
>   {
> -	u32 offset, msr, value;
> -	int write, mask;
> +	u32 offset, msr;
> +	int write;
> +	u8 value;
>   
>   	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
>   		return NESTED_EXIT_HOST;
> @@ -1372,18 +1367,15 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
>   	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
>   	offset = svm_msrpm_offset(msr);
>   	write  = svm->vmcb->control.exit_info_1 & 1;
> -	mask   = 1 << ((2 * (msr & 0xf)) + write);

This is wrong.  The bit to read isn't always bit 0 or bit 1, therefore 
mask needs to remain.  But it can be written easily as:

	msr = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	write = svm->vmcb->control.exit_info_1 & 1;

	bit = svm_msrpm_bit(msr);
	if (bit == MSR_INVALID)
		return NESTED_EXIT_DONE;
	offset = bit / BITS_PER_BYTE;
	mask = BIT(write) << (bit & (BITS_PER_BYTE - 1));

and this even removes the need to use svm_msrpm_offset() in 
nested_svm_exit_handled_msr().


At this point, it may even make sense to keep the adjustment for the 
offset in svm_msrpm_offset(), like this:

static u32 svm_msrpm_offset(u32 msr)
{
	u32 bit = svm_msrpm_bit(msr);
	if (bit == MSR_INVALID)
		return MSR_INVALID;

	/*
	 * Merging is done in 32-bit chunks to reduce the number of
	 * accesses to L1's bitmap.
	 */
	return bit / (BITS_PER_BYTE * sizeof(u32));
}

I'll let you be the judge on this.

Paolo

>   	if (offset == MSR_INVALID)
>   		return NESTED_EXIT_DONE;
>   
> -	/* Offset is in 32 bit units but need in 8 bit units */
> -	offset *= 4;
> -
> -	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
> +	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset,
> +				&value, sizeof(value)))
>   		return NESTED_EXIT_DONE;
>   
> -	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
> +	return (value & BIT(write)) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
>   }
>   
>   static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 77287c870967..155b6089fcd2 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -634,15 +634,23 @@ static_assert(SVM_MSRS_PER_RANGE == 8192);
>   	(range_nr * SVM_MSRPM_BYTES_PER_RANGE * BITS_PER_BYTE +			\
>   	 (msr - SVM_MSRPM_RANGE_## range_nr ##_BASE_MSR) * SVM_BITS_PER_MSR)
>   
> -#define SVM_MSRPM_SANITY_CHECK_BITS(range_nr)					\
> +#define SVM_MSRPM_BYTE_NR(range_nr, msr)					\
> +	(range_nr * SVM_MSRPM_BYTES_PER_RANGE +					\
> +	 (msr - SVM_MSRPM_RANGE_## range_nr ##_BASE_MSR) / SVM_MSRS_PER_BYTE)
> +
> +#define SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(range_nr)				\
>   static_assert(SVM_MSRPM_BIT_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 1) ==	\
>   	      range_nr * 2048 * 8 + 2);						\
>   static_assert(SVM_MSRPM_BIT_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 7) ==	\
> -	      range_nr * 2048 * 8 + 14);
> +	      range_nr * 2048 * 8 + 14);					\
> +static_assert(SVM_MSRPM_BYTE_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 1) ==	\
> +	      range_nr * 2048);							\
> +static_assert(SVM_MSRPM_BYTE_NR(range_nr, SVM_MSRPM_FIRST_MSR(range_nr) + 7) ==	\
> +	      range_nr * 2048 + 1);
>   
> -SVM_MSRPM_SANITY_CHECK_BITS(0);
> -SVM_MSRPM_SANITY_CHECK_BITS(1);
> -SVM_MSRPM_SANITY_CHECK_BITS(2);
> +SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(0);
> +SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(1);
> +SVM_MSRPM_SANITY_CHECK_BITS_AND_BYTES(2);
>   
>   #define SVM_BUILD_MSR_BITMAP_CASE(bitmap, range_nr, msr, bitop, bit_rw)		\
>   	case SVM_MSRPM_FIRST_MSR(range_nr) ... SVM_MSRPM_LAST_MSR(range_nr):	\
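
Folding both suggestions from the review together, the exit handler would
end up looking roughly like the sketch below; svm_msrpm_bit() is the
hypothetical helper from the review that returns the absolute bit number of
an MSR's read-intercept bit (or MSR_INVALID), not necessarily the name that
eventually landed upstream:

static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
{
	u32 bit, offset, msr;
	int write;
	u8 value, mask;

	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
		return NESTED_EXIT_HOST;

	msr   = svm->vcpu.arch.regs[VCPU_REGS_RCX];
	write = svm->vmcb->control.exit_info_1 & 1;

	bit = svm_msrpm_bit(msr);
	if (bit == MSR_INVALID)
		return NESTED_EXIT_DONE;

	/* Each MSR owns two adjacent bits in its byte: read, then write. */
	offset = bit / BITS_PER_BYTE;
	mask   = BIT(write) << (bit & (BITS_PER_BYTE - 1));

	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset,
				&value, sizeof(value)))
		return NESTED_EXIT_DONE;

	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
}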
Re: [PATCH 25/28] KVM: nSVM: Access MSRPM in 4-byte chunks only for merging L0 and L1 bitmaps
Posted by Sean Christopherson 8 months, 1 week ago
On Wed, Jun 04, 2025, Paolo Bonzini wrote:
> On 5/30/25 01:40, Sean Christopherson wrote:
> > @@ -1363,8 +1357,9 @@ void svm_leave_nested(struct kvm_vcpu *vcpu)
> >   static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
> >   {
> > -	u32 offset, msr, value;
> > -	int write, mask;
> > +	u32 offset, msr;
> > +	int write;
> > +	u8 value;
> >   	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
> >   		return NESTED_EXIT_HOST;
> > @@ -1372,18 +1367,15 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
> >   	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
> >   	offset = svm_msrpm_offset(msr);
> >   	write  = svm->vmcb->control.exit_info_1 & 1;
> > -	mask   = 1 << ((2 * (msr & 0xf)) + write);
> 
> This is wrong.  The bit to read isn't always bit 0 or bit 1, therefore mask
> needs to remain.

/facepalm

Duh.  I managed to forget that multiple MSRs are packed into a byte.  Hrm, which
means our nSVM test is even more worthless than I thought.  I'll see if I can get
it to detect this bug.
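
For reference, a quick userspace illustration of the packing in question,
assuming the usual range-1 base of 0xc0000000 and 2048 bytes per range (the
MSR and numbers below are illustrative, not taken from the thread):

#include <stdio.h>

#define BITS_PER_BYTE       8
#define RANGE_1_BASE_MSR    0xc0000000u  /* second MSRPM range */
#define BYTES_PER_RANGE     2048u

int main(void)
{
	unsigned int msr   = 0xc0000082;  /* MSR_LSTAR, lives in range 1 */
	unsigned int write = 1;           /* WRMSR intercept */

	/* 2 bits per MSR: read-intercept bit first, write-intercept bit next. */
	unsigned int bit  = BYTES_PER_RANGE * BITS_PER_BYTE +
			    (msr - RANGE_1_BASE_MSR) * 2;
	unsigned int byte = bit / BITS_PER_BYTE;
	unsigned int mask = (1u << write) << (bit % BITS_PER_BYTE);

	/* Prints "byte 2080, mask 0x20". */
	printf("byte %u, mask 0x%x\n", byte, mask);
	return 0;
}

The interesting bit lands at position 4/5 within its byte here, which is
exactly why the one-byte read still needs the shifted mask rather than a
bare BIT(write).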