For hardware-protected VMs like SEV-SNP guests, certain conditions, such as
attempting to write to a page that is not in the state the guest expects it
to be in, can result in a nested/extended #PF which can only be satisfied by
the host performing an implicit page state change to transition the page into
the expected shared/private state. This is generally handled by generating a
KVM_EXIT_MEMORY_FAULT event that gets forwarded to userspace to handle via
KVM_SET_MEMORY_ATTRIBUTES.
However, the fast_page_fault() code might misconstrue this situation as
being the result of a write-protected access, and treat it as a spurious
case when it sees that writes are already allowed for the sPTE. The KVM MMU
then tries to resume the guest rather than taking any action to satisfy the
real source of the #PF, such as generating a KVM_EXIT_MEMORY_FAULT, leaving
the guest spinning on nested #PFs.
For now, just skip the fast path for hardware-protected VMs since they
don't currently utilize any of this access-tracking machinery anyway. In
the future, these considerations will need to be taken into account if
there's any need/desire to re-enable the fast path for
hardware-protected VMs.
Since software-protected VMs don't have a notion of shared vs. private state
that's separate from what KVM is tracking, the above KVM_EXIT_MEMORY_FAULT
condition wouldn't occur, so avoid the special handling for that case for now.
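For reference, a rough sketch of the userspace side assumed above, i.e. a VMM
handling the resulting KVM_EXIT_MEMORY_FAULT by updating the range's attributes
via KVM_SET_MEMORY_ATTRIBUTES and re-entering the guest (illustrative only, not
part of this patch; vm_fd/run are the VMM's VM fd and vcpu kvm_run mapping, and
error handling is omitted):

	struct kvm_memory_attributes attrs = {
		.address    = run->memory_fault.gpa,
		.size       = run->memory_fault.size,
		.attributes = (run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE) ?
			      KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
	};

	/* Flip the range to the state the guest faulted on, then KVM_RUN
	 * again so the guest retries the original access. */
	ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs);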
Cc: Isaku Yamahata <isaku.yamahata@intel.com>
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
arch/x86/kvm/mmu/mmu.c | 30 ++++++++++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 62ad38b2a8c9..cecd8360378f 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3296,7 +3296,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
return RET_PF_CONTINUE;
}
-static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
{
/*
* Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
@@ -3307,6 +3307,32 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
if (fault->rsvd)
return false;
+ /*
+ * For hardware-protected VMs, certain conditions like attempting to
+ * perform a write to a page which is not in the state that the guest
+ * expects it to be in can result in a nested/extended #PF. In this
+ * case, the below code might misconstrue this situation as being the
+ * result of a write-protected access, and treat it as a spurious case
+ * rather than taking any action to satisfy the real source of the #PF
+ * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
+ * guest spinning on a #PF indefinitely.
+ *
+ * For now, just skip the fast path for hardware-protected VMs since
+ * they don't currently utilize any of this machinery anyway. In the
+ * future, these considerations will need to be taken into account if
+ * there's any need/desire to re-enable the fast path for
+ * hardware-protected VMs.
+ *
+ * Since software-protected VMs don't have a notion of a shared vs.
+ * private that's separate from what KVM is tracking, the above
+ * KVM_EXIT_MEMORY_FAULT condition wouldn't occur, so avoid the
+ * special handling for that case for now.
+ */
+ if (kvm_slot_can_be_private(fault->slot) &&
+ !(IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
+ vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM))
+ return false;
+
/*
* #PF can be fast if:
*
@@ -3407,7 +3433,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
u64 *sptep;
uint retry_count = 0;
- if (!page_fault_can_be_fast(fault))
+ if (!page_fault_can_be_fast(vcpu, fault))
return ret;
walk_shadow_page_lockless_begin(vcpu);
--
2.25.1
On Thu, May 09, 2024, Michael Roth wrote:
> ---
> arch/x86/kvm/mmu/mmu.c | 30 ++++++++++++++++++++++++++++--
> 1 file changed, 28 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index 62ad38b2a8c9..cecd8360378f 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3296,7 +3296,7 @@ static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
> return RET_PF_CONTINUE;
> }
>
> -static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
> +static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
> {
> /*
> * Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
> @@ -3307,6 +3307,32 @@ static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
> if (fault->rsvd)
> return false;
>
> + /*
> + * For hardware-protected VMs, certain conditions like attempting to
> + * perform a write to a page which is not in the state that the guest
> + * expects it to be in can result in a nested/extended #PF. In this
> + * case, the below code might misconstrue this situation as being the
> + * result of a write-protected access, and treat it as a spurious case
> + * rather than taking any action to satisfy the real source of the #PF
> + * such as generating a KVM_EXIT_MEMORY_FAULT. This can lead to the
> + * guest spinning on a #PF indefinitely.
> + *
> + * For now, just skip the fast path for hardware-protected VMs since
> + * they don't currently utilize any of this machinery anyway. In the
> + * future, these considerations will need to be taken into account if
> + * there's any need/desire to re-enable the fast path for
> + * hardware-protected VMs.
> + *
> + * Since software-protected VMs don't have a notion of a shared vs.
> + * private that's separate from what KVM is tracking, the above
> + * KVM_EXIT_MEMORY_FAULT condition wouldn't occur, so avoid the
> + * special handling for that case for now.
Very technically, it can occur if userspace _just_ modified the attributes. And
as I've said multiple times, at least for now, I want to avoid special casing
SW-protected VMs unless it is *absolutely* necessary, because their sole purpose
is to allow testing flows that are impossible to exercise without SNP/TDX hardware.
> + */
> + if (kvm_slot_can_be_private(fault->slot) &&
> + !(IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
> + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM))
Heh, !(x && y) kills me, I misread this like 4 times.
Anyways, I don't like the heuristic. It doesn't tie the restriction back to the
cause in any reasonable way. Can't this simply be?
if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
return false;
Which is much, much more self-explanatory.
On Fri, May 10, 2024 at 3:47 PM Sean Christopherson <seanjc@google.com> wrote:
> > + * Since software-protected VMs don't have a notion of a shared vs.
> > + * private that's separate from what KVM is tracking, the above
> > + * KVM_EXIT_MEMORY_FAULT condition wouldn't occur, so avoid the
> > + * special handling for that case for now.
>
> Very technically, it can occur if userspace _just_ modified the attributes. And
> as I've said multiple times, at least for now, I want to avoid special casing
> SW-protected VMs unless it is *absolutely* necessary, because their sole purpose
> is to allow testing flows that are impossible to exercise without SNP/TDX hardware.

Yep, it is not like they have to be optimized.

> > + */
> > + if (kvm_slot_can_be_private(fault->slot) &&
> > + !(IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
> > + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM))
>
> Heh, !(x && y) kills me, I misread this like 4 times.
>
> Anyways, I don't like the heuristic. It doesn't tie the restriction back to the
> cause in any reasonable way. Can't this simply be?
>
> 	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
> 		return false;

You beat me to it by seconds. And it can also be guarded by a check on
kvm->arch.has_private_mem to avoid the attributes lookup.

> Which is much, much more self-explanatory.

Both more self-explanatory and more correct.

Paolo
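Putting Sean's check and Paolo's kvm->arch.has_private_mem guard together, a
minimal sketch of how this might look in page_fault_can_be_fast() (illustrative
only; the version Mike actually tested is linked in the follow-up below):

	/*
	 * Bail out of the fast path if the fault's shared/private state
	 * disagrees with KVM's memory attributes, so the slow path can
	 * generate KVM_EXIT_MEMORY_FAULT instead of spuriously resuming
	 * the guest. The has_private_mem check avoids the attributes
	 * lookup for VMs without private memory.
	 */
	if (vcpu->kvm->arch.has_private_mem &&
	    fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
		return false;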
On Fri, May 10, 2024 at 03:50:26PM +0200, Paolo Bonzini wrote:
> On Fri, May 10, 2024 at 3:47 PM Sean Christopherson <seanjc@google.com> wrote:
> > > + * Since software-protected VMs don't have a notion of a shared vs.
> > > + * private that's separate from what KVM is tracking, the above
> > > + * KVM_EXIT_MEMORY_FAULT condition wouldn't occur, so avoid the
> > > + * special handling for that case for now.
> >
> > Very technically, it can occur if userspace _just_ modified the attributes. And
> > as I've said multiple times, at least for now, I want to avoid special casing
> > SW-protected VMs unless it is *absolutely* necessary, because their sole purpose
> > is to allow testing flows that are impossible to excercise without SNP/TDX hardware.
>
> Yep, it is not like they have to be optimized.

Ok, I thought there were maybe some future plans to use sw-protected VMs
to get some added protections from userspace. But even then there'd
probably still be extra considerations for how to handle access tracking
so white-listing them probably isn't right anyway.

I was also partly tempted to take this route because it would cover this
TDX patch as well:

  https://lore.kernel.org/lkml/91c797997b57056224571e22362321a23947172f.1705965635.git.isaku.yamahata@intel.com/

and avoid any weirdness about checking kvm_mem_is_private() without
checking mmu_invalidate_seq, but I think those cases all end up
resolving themselves eventually and added some comments around that.

> > > + */
> > > + if (kvm_slot_can_be_private(fault->slot) &&
> > > + !(IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) &&
> > > + vcpu->kvm->arch.vm_type == KVM_X86_SW_PROTECTED_VM))
> >
> > Heh, !(x && y) kills me, I misread this like 4 times.
> >
> > Anyways, I don't like the heuristic. It doesn't tie the restriction back to the
> > cause in any reasonable way. Can't this simply be?
> >
> > 	if (fault->is_private != kvm_mem_is_private(vcpu->kvm, fault->gfn))
> > 		return false;
>
> You beat me to it by seconds. And it can also be guarded by a check on
> kvm->arch.has_private_mem to avoid the attributes lookup.

I re-tested with things implemented this way and everything seems to
look good. It's not clear to me whether this would cover the cases the
above-mentioned TDX patch handles, but no biggie if that's still needed.

The new version of the patch is here:

  https://github.com/mdroth/linux/commit/39643f9f6da6265d39d633a703c53997985c1208

And I've updated my branches to replace the old patch and also
incorporate Sean's suggestions for patch 22:

  https://github.com/mdroth/linux/commits/snp-host-v15c3-unsquashed

and have them here with things already squashed in/relocated:

  https://github.com/mdroth/linux/commits/snp-host-v15c3

Thanks for the feedback Sean, Paolo.

-Mike

> > Which is much, much more self-explanatory.
>
> Both more self-explanatory and more correct.
>
> Paolo
On Fri, May 10, 2024, Michael Roth wrote:
> On Fri, May 10, 2024 at 03:50:26PM +0200, Paolo Bonzini wrote:
> > On Fri, May 10, 2024 at 3:47 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > + * Since software-protected VMs don't have a notion of a shared vs.
> > > > + * private that's separate from what KVM is tracking, the above
> > > > + * KVM_EXIT_MEMORY_FAULT condition wouldn't occur, so avoid the
> > > > + * special handling for that case for now.
> > >
> > > Very technically, it can occur if userspace _just_ modified the attributes. And
> > > as I've said multiple times, at least for now, I want to avoid special casing
> > > SW-protected VMs unless it is *absolutely* necessary, because their sole purpose
> > > is to allow testing flows that are impossible to excercise without SNP/TDX hardware.
> >
> > Yep, it is not like they have to be optimized.
>
> Ok, I thought there were maybe some future plans to use sw-protected VMs
> to get some added protections from userspace. But even then there'd
> probably still be extra considerations for how to handle access tracking
> so white-listing them probably isn't right anyway.
>
> I was also partly tempted to take this route because it would cover this
> TDX patch as well:
>
> https://lore.kernel.org/lkml/91c797997b57056224571e22362321a23947172f.1705965635.git.isaku.yamahata@intel.com/

Hmm, I'm pretty sure that patch is trying to fix the exact same issue you are
fixing, just in a less precise way. S-EPT entries only support RWX=0 and RWX=111b,
i.e. it should be impossible to have a write-fault to a present S-EPT entry.

And if TDX is running afoul of this code:

	if (!fault->present)
		return !kvm_ad_enabled();

then KVM should do the sane thing and require A/D support be enabled for TDX.

And if it's something else entirely, that changelog has some explaining to do.

> and avoid any weirdness about checking kvm_mem_is_private() without
> checking mmu_invalidate_seq, but I think those cases all end up
> resolving themselves eventually and added some comments around that.

Yep, checking state that is protected by mmu_invalidate_seq outside of mmu_lock
is definitely allowed, e.g. the entire fast page fault path operates outside of
mmu_lock and thus outside of mmu_invalidate_seq's purview.

It's a-ok because the SPTE updates are done with an atomic CMPXCHG, and so KVM
only needs to ensure its page tables aren't outright _freed_. If the zap
triggered by the attributes change "wins", then the fast #PF path will fail the
CMPXCHG and be an expensive NOP. If the fast #PF wins, the zap will pave over
the fast #PF fix, and the IPI+flush that is needed for all zaps, to ensure vCPUs
don't have stale references, does the rest.

And if there's an attributes race that causes the fast #PF to bail early, the
vCPU will see the correct state on the next page fault.
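To illustrate the CMPXCHG point above: the fast path only publishes its fix with
an atomic compare-and-exchange on the SPTE (as in fast_pf_fix_direct_spte()), so
a concurrent zap triggered by an attributes change simply makes that exchange
fail. A rough sketch of the commit step, not the exact upstream code:

	/*
	 * If something else (e.g. a zap due to a memory attributes update)
	 * modified the SPTE after it was read, the cmpxchg fails and the
	 * fast path gives up; the next fault goes down the slow path and
	 * sees the up-to-date state.
	 */
	if (!try_cmpxchg64(sptep, &old_spte, new_spte))
		return false;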
On Fri, May 10, 2024 at 08:59:09AM -0700, Sean Christopherson <seanjc@google.com> wrote:
> On Fri, May 10, 2024, Michael Roth wrote:
> > On Fri, May 10, 2024 at 03:50:26PM +0200, Paolo Bonzini wrote:
> > > On Fri, May 10, 2024 at 3:47 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > > + * Since software-protected VMs don't have a notion of a shared vs.
> > > > > + * private that's separate from what KVM is tracking, the above
> > > > > + * KVM_EXIT_MEMORY_FAULT condition wouldn't occur, so avoid the
> > > > > + * special handling for that case for now.
> > > >
> > > > Very technically, it can occur if userspace _just_ modified the attributes. And
> > > > as I've said multiple times, at least for now, I want to avoid special casing
> > > > SW-protected VMs unless it is *absolutely* necessary, because their sole purpose
> > > > is to allow testing flows that are impossible to excercise without SNP/TDX hardware.
> > >
> > > Yep, it is not like they have to be optimized.
> >
> > Ok, I thought there were maybe some future plans to use sw-protected VMs
> > to get some added protections from userspace. But even then there'd
> > probably still be extra considerations for how to handle access tracking
> > so white-listing them probably isn't right anyway.
> >
> > I was also partly tempted to take this route because it would cover this
> > TDX patch as well:
> >
> > https://lore.kernel.org/lkml/91c797997b57056224571e22362321a23947172f.1705965635.git.isaku.yamahata@intel.com/
>
> Hmm, I'm pretty sure that patch is trying to fix the exact same issue you are
> fixing, just in a less precise way. S-EPT entries only support RWX=0 and RWX=111b,
> i.e. it should be impossible to have a write-fault to a present S-EPT entry.
>
> And if TDX is running afoul of this code:
>
> 	if (!fault->present)
> 		return !kvm_ad_enabled();
>
> then KVM should do the sane thing and require A/D support be enabled for TDX.
>
> And if it's something else entirely, that changelog has some explaining to do.

Yes, it's for KVM_EXIT_MEMORY_FAULT case. Because Secure-EPT has non-present or
all RWX allowed, fast page fault always returns RET_PF_INVALID by
is_shadow_present_pte() check.

I lightly tested the patch at [1] and it works for TDX KVM.

[1] https://github.com/mdroth/linux/commit/39643f9f6da6265d39d633a703c53997985c1208

Just in case for that patch,

Reviewed-by: Isaku Yamahata <isaku.yamahata@intel.com>
--
Isaku Yamahata <isaku.yamahata@intel.com>
The intended logic when handling #NPFs with the RMP bit (bit 31) set is to
first check whether the #NPF requires a shared<->private transition
and, if so, to go ahead and let the corresponding KVM_EXIT_MEMORY_FAULT
get forwarded on to userspace before proceeding with any handling of
other potential RMP fault conditions like needing to PSMASH the RMP
entry/etc (which will be done later if the guest still re-faults after
the KVM_EXIT_MEMORY_FAULT is processed by userspace).
The determination of whether any userspace handling of
KVM_EXIT_MEMORY_FAULT is needed is done by interpreting the return code
of kvm_mmu_page_fault(). However, the current code misinterprets the
return code, expecting 0 to indicate a userspace exit rather than less
than 0 (-EFAULT). This leads to the following unexpected behavior:
- for KVM_EXIT_MEMORY_FAULTs resulting from implicit shared->private
conversions, warnings get printed from sev_handle_rmp_fault()
because it does not expect to be called for GPAs where
KVM_MEMORY_ATTRIBUTE_PRIVATE is not set. Standard Linux guests don't
generally do this, but it is allowed and should be handled
similarly to private->shared conversions rather than triggering any
sort of warnings
- if gmem support for 2MB folios is enabled (via currently out-of-tree
code), implicit shared<->private conversions will always result in
a PSMASH being attempted, even if it's not actually needed to
resolve the RMP fault. This doesn't cause any harm, but results in a
needless PSMASH and zapping of the sPTE
Resolve these issues by calling sev_handle_rmp_fault() only when
kvm_mmu_page_fault()'s return code is greater than or equal to 0,
indicating a KVM_EXIT_MEMORY_FAULT/-EFAULT isn't needed. While here,
simplify the code slightly and fix up the associated comments for better
clarity.
Fixes: ccc9d836c5c3 ("KVM: SEV: Add support to handle RMP nested page faults")
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
arch/x86/kvm/svm/svm.c | 10 ++++------
1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 426ad49325d7..9431ce74c7d4 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2070,14 +2070,12 @@ static int npf_interception(struct kvm_vcpu *vcpu)
svm->vmcb->control.insn_len);
/*
- * rc == 0 indicates a userspace exit is needed to handle page
- * transitions, so do that first before updating the RMP table.
+ * rc < 0 indicates a userspace exit may be needed to handle page
+ * attribute updates, so deal with that first before handling other
+ * potential RMP fault conditions.
*/
- if (error_code & PFERR_GUEST_RMP_MASK) {
- if (rc == 0)
- return rc;
+ if (rc >= 0 && error_code & PFERR_GUEST_RMP_MASK)
sev_handle_rmp_fault(vcpu, fault_address, error_code);
- }
return rc;
}
--
2.25.1
On Thu, May 09, 2024, Michael Roth wrote:
> The intended logic when handling #NPFs with the RMP bit set (31) is to
> first check to see if the #NPF requires a shared<->private transition
> and, if so, to go ahead and let the corresponding KVM_EXIT_MEMORY_FAULT
> get forwarded on to userspace before proceeding with any handling of
> other potential RMP fault conditions like needing to PSMASH the RMP
> entry/etc (which will be done later if the guest still re-faults after
> the KVM_EXIT_MEMORY_FAULT is processed by userspace).
>
> The determination of whether any userspace handling of
> KVM_EXIT_MEMORY_FAULT is needed is done by interpreting the return code
> of kvm_mmu_page_fault(). However, the current code misinterprets the
> return code, expecting 0 to indicate a userspace exit rather than less
> than 0 (-EFAULT). This leads to the following unexpected behavior:
>
> - for KVM_EXIT_MEMORY_FAULTs resulting for implicit shared->private
> conversions, warnings get printed from sev_handle_rmp_fault()
> because it does not expect to be called for GPAs where
> KVM_MEMORY_ATTRIBUTE_PRIVATE is not set. Standard linux guests don't
> generally do this, but it is allowed and should be handled
> similarly to private->shared conversions rather than triggering any
> sort of warnings
>
> - if gmem support for 2MB folios is enabled (via currently out-of-tree
> code), implicit shared<->private conversions will always result in
> a PSMASH being attempted, even if it's not actually needed to
> resolve the RMP fault. This doesn't cause any harm, but results in a
> needless PSMASH and zapping of the sPTE
>
> Resolve these issues by calling sev_handle_rmp_fault() only when
> kvm_mmu_page_fault()'s return code is greater than or equal to 0,
> indicating a KVM_MEMORY_EXIT_FAULT/-EFAULT isn't needed. While here,
> simplify the code slightly and fix up the associated comments for better
> clarity.
>
> Fixes: ccc9d836c5c3 ("KVM: SEV: Add support to handle RMP nested page faults")
>
> Signed-off-by: Michael Roth <michael.roth@amd.com>
> ---
> arch/x86/kvm/svm/svm.c | 10 ++++------
> 1 file changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 426ad49325d7..9431ce74c7d4 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -2070,14 +2070,12 @@ static int npf_interception(struct kvm_vcpu *vcpu)
> svm->vmcb->control.insn_len);
>
> /*
> - * rc == 0 indicates a userspace exit is needed to handle page
> - * transitions, so do that first before updating the RMP table.
> + * rc < 0 indicates a userspace exit may be needed to handle page
> + * attribute updates, so deal with that first before handling other
> + * potential RMP fault conditions.
> */
> - if (error_code & PFERR_GUEST_RMP_MASK) {
> - if (rc == 0)
> - return rc;
> + if (rc >= 0 && error_code & PFERR_GUEST_RMP_MASK)
This isn't correct either. A return of '0' also indicates "exit to userspace",
it just doesn't happen with SNP because '0' is returned only when KVM attempts
emulation, and that too gets short-circuited by svm_check_emulate_instruction().
And I would honestly drop the comment, KVM's less-than-pleasant 1/0/-errno return
values overload is ubiquitous enough that it should be relatively self-explanatory.
Or if you prefer to keep a comment, drop the part that specifically calls out
attributes updates, because that incorrectly implies that's the _only_ reason
why KVM checks the return. But my vote is to drop the comment, because it
essentially becomes "don't proceed to step 2 if step 1 failed", which kind of
makes the reader go "well, yeah".
On 5/10/24 15:58, Sean Christopherson wrote:
> On Thu, May 09, 2024, Michael Roth wrote:
>> The intended logic when handling #NPFs with the RMP bit set (31) is to
>> first check to see if the #NPF requires a shared<->private transition
>> and, if so, to go ahead and let the corresponding KVM_EXIT_MEMORY_FAULT
>> get forwarded on to userspace before proceeding with any handling of
>> other potential RMP fault conditions like needing to PSMASH the RMP
>> entry/etc (which will be done later if the guest still re-faults after
>> the KVM_EXIT_MEMORY_FAULT is processed by userspace).
>>
>> The determination of whether any userspace handling of
>> KVM_EXIT_MEMORY_FAULT is needed is done by interpreting the return code
>> of kvm_mmu_page_fault(). However, the current code misinterprets the
>> return code, expecting 0 to indicate a userspace exit rather than less
>> than 0 (-EFAULT). This leads to the following unexpected behavior:
>>
>> - for KVM_EXIT_MEMORY_FAULTs resulting for implicit shared->private
>> conversions, warnings get printed from sev_handle_rmp_fault()
>> because it does not expect to be called for GPAs where
>> KVM_MEMORY_ATTRIBUTE_PRIVATE is not set. Standard linux guests don't
>> generally do this, but it is allowed and should be handled
>> similarly to private->shared conversions rather than triggering any
>> sort of warnings
>>
>> - if gmem support for 2MB folios is enabled (via currently out-of-tree
>> code), implicit shared<->private conversions will always result in
>> a PSMASH being attempted, even if it's not actually needed to
>> resolve the RMP fault. This doesn't cause any harm, but results in a
>> needless PSMASH and zapping of the sPTE
>>
>> Resolve these issues by calling sev_handle_rmp_fault() only when
>> kvm_mmu_page_fault()'s return code is greater than or equal to 0,
>> indicating a KVM_MEMORY_EXIT_FAULT/-EFAULT isn't needed. While here,
>> simplify the code slightly and fix up the associated comments for better
>> clarity.
>>
>> Fixes: ccc9d836c5c3 ("KVM: SEV: Add support to handle RMP nested page faults")
>>
>> Signed-off-by: Michael Roth <michael.roth@amd.com>
>> ---
>> arch/x86/kvm/svm/svm.c | 10 ++++------
>> 1 file changed, 4 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index 426ad49325d7..9431ce74c7d4 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -2070,14 +2070,12 @@ static int npf_interception(struct kvm_vcpu *vcpu)
>> svm->vmcb->control.insn_len);
>>
>> /*
>> - * rc == 0 indicates a userspace exit is needed to handle page
>> - * transitions, so do that first before updating the RMP table.
>> + * rc < 0 indicates a userspace exit may be needed to handle page
>> + * attribute updates, so deal with that first before handling other
>> + * potential RMP fault conditions.
>> */
>> - if (error_code & PFERR_GUEST_RMP_MASK) {
>> - if (rc == 0)
>> - return rc;
>> + if (rc >= 0 && error_code & PFERR_GUEST_RMP_MASK)
>
> This isn't correct either. A return of '0' also indiciates "exit to userspace",
> it just doesn't happen with SNP because '0' is returned only when KVM attempts
> emulation, and that too gets short-circuited by svm_check_emulate_instruction().
>
> And I would honestly drop the comment, KVM's less-than-pleasant 1/0/-errno return
> values overload is ubiquitous enough that it should be relatively self-explanatory.
>
> Or if you prefer to keep a comment, drop the part that specifically calls out
> attributes updates, because that incorrectly implies that's the _only_ reason
> why KVM checks the return. But my vote is to drop the comment, because it
> essentially becomes "don't proceed to step 2 if step 1 failed", which kind of
> makes the reader go "well, yeah".
So IIUC you're suggesting
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 426ad49325d7..c39eaeb21981 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -2068,16 +2068,11 @@ static int npf_interception(struct kvm_vcpu *vcpu)
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
svm->vmcb->control.insn_len);
+ if (rc <= 0)
+ return rc;
- /*
- * rc == 0 indicates a userspace exit is needed to handle page
- * transitions, so do that first before updating the RMP table.
- */
- if (error_code & PFERR_GUEST_RMP_MASK) {
- if (rc == 0)
- return rc;
+ if (error_code & PFERR_GUEST_RMP_MASK)
sev_handle_rmp_fault(vcpu, fault_address, error_code);
- }
return rc;
}
?
So, we're... a bit tight for 6.10 to include SNP and that is an
understatement. My plan is to merge it for 6.11, but do so
immediately after the merge window ends. In other words, it
is a delay in terms of release but not in terms of time. I
don't want QEMU and kvm-unit-tests work to be delayed any
further, in particular.
Once we sort out the loose ends of patches 21-23, you could send
it as a pull request.
Paolo
On Fri, May 10, 2024 at 06:01:52PM +0200, Paolo Bonzini wrote:
> On 5/10/24 15:58, Sean Christopherson wrote:
> > On Thu, May 09, 2024, Michael Roth wrote:
> > > The intended logic when handling #NPFs with the RMP bit set (31) is to
> > > first check to see if the #NPF requires a shared<->private transition
> > > and, if so, to go ahead and let the corresponding KVM_EXIT_MEMORY_FAULT
> > > get forwarded on to userspace before proceeding with any handling of
> > > other potential RMP fault conditions like needing to PSMASH the RMP
> > > entry/etc (which will be done later if the guest still re-faults after
> > > the KVM_EXIT_MEMORY_FAULT is processed by userspace).
> > >
> > > The determination of whether any userspace handling of
> > > KVM_EXIT_MEMORY_FAULT is needed is done by interpreting the return code
> > > of kvm_mmu_page_fault(). However, the current code misinterprets the
> > > return code, expecting 0 to indicate a userspace exit rather than less
> > > than 0 (-EFAULT). This leads to the following unexpected behavior:
> > >
> > > - for KVM_EXIT_MEMORY_FAULTs resulting for implicit shared->private
> > > conversions, warnings get printed from sev_handle_rmp_fault()
> > > because it does not expect to be called for GPAs where
> > > KVM_MEMORY_ATTRIBUTE_PRIVATE is not set. Standard linux guests don't
> > > generally do this, but it is allowed and should be handled
> > > similarly to private->shared conversions rather than triggering any
> > > sort of warnings
> > >
> > > - if gmem support for 2MB folios is enabled (via currently out-of-tree
> > > code), implicit shared<->private conversions will always result in
> > > a PSMASH being attempted, even if it's not actually needed to
> > > resolve the RMP fault. This doesn't cause any harm, but results in a
> > > needless PSMASH and zapping of the sPTE
> > >
> > > Resolve these issues by calling sev_handle_rmp_fault() only when
> > > kvm_mmu_page_fault()'s return code is greater than or equal to 0,
> > > indicating a KVM_MEMORY_EXIT_FAULT/-EFAULT isn't needed. While here,
> > > simplify the code slightly and fix up the associated comments for better
> > > clarity.
> > >
> > > Fixes: ccc9d836c5c3 ("KVM: SEV: Add support to handle RMP nested page faults")
> > >
> > > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > > ---
> > > arch/x86/kvm/svm/svm.c | 10 ++++------
> > > 1 file changed, 4 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > > index 426ad49325d7..9431ce74c7d4 100644
> > > --- a/arch/x86/kvm/svm/svm.c
> > > +++ b/arch/x86/kvm/svm/svm.c
> > > @@ -2070,14 +2070,12 @@ static int npf_interception(struct kvm_vcpu *vcpu)
> > > svm->vmcb->control.insn_len);
> > > /*
> > > - * rc == 0 indicates a userspace exit is needed to handle page
> > > - * transitions, so do that first before updating the RMP table.
> > > + * rc < 0 indicates a userspace exit may be needed to handle page
> > > + * attribute updates, so deal with that first before handling other
> > > + * potential RMP fault conditions.
> > > */
> > > - if (error_code & PFERR_GUEST_RMP_MASK) {
> > > - if (rc == 0)
> > > - return rc;
> > > + if (rc >= 0 && error_code & PFERR_GUEST_RMP_MASK)
> >
> > This isn't correct either. A return of '0' also indiciates "exit to userspace",
> > it just doesn't happen with SNP because '0' is returned only when KVM attempts
> > emulation, and that too gets short-circuited by svm_check_emulate_instruction().
> >
> > And I would honestly drop the comment, KVM's less-than-pleasant 1/0/-errno return
> > values overload is ubiquitous enough that it should be relatively self-explanatory.
> >
> > Or if you prefer to keep a comment, drop the part that specifically calls out
> > attributes updates, because that incorrectly implies that's the _only_ reason
> > why KVM checks the return. But my vote is to drop the comment, because it
> > essentially becomes "don't proceed to step 2 if step 1 failed", which kind of
> > makes the reader go "well, yeah".
>
> So IIUC you're suggesting
>
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index 426ad49325d7..c39eaeb21981 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -2068,16 +2068,11 @@ static int npf_interception(struct kvm_vcpu *vcpu)
> static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
> svm->vmcb->control.insn_bytes : NULL,
> svm->vmcb->control.insn_len);
> + if (rc <= 0)
> + return rc;
> - /*
> - * rc == 0 indicates a userspace exit is needed to handle page
> - * transitions, so do that first before updating the RMP table.
> - */
> - if (error_code & PFERR_GUEST_RMP_MASK) {
> - if (rc == 0)
> - return rc;
> + if (error_code & PFERR_GUEST_RMP_MASK)
> sev_handle_rmp_fault(vcpu, fault_address, error_code);
> - }
> return rc;
> }
>
> ?
>
> So, we're... a bit tight for 6.10 to include SNP and that is an
> understatement. My plan is to merge it for 6.11, but do so
> immediately after the merge window ends. In other words, it
> is a delay in terms of release but not in terms of time. I
> don't want QEMU and kvm-unit-tests work to be delayed any
> further, in particular.
That's unfortunate, I'd thought from the PUCK call that we still had
some time to stabilize things before merge window. But whatever you
think is best.
>
> Once we sort out the loose ends of patches 21-23, you could send
> it as a pull request.
Ok, as a pull request against kvm/next, or kvm/queue?
Thanks,
Mike
>
> Paolo
>
>
On 5/10/24 18:37, Michael Roth wrote:
>> So, we're... a bit tight for 6.10 to include SNP and that is an
>> understatement. My plan is to merge it for 6.11, but do so
>> immediately after the merge window ends. In other words, it
>> is a delay in terms of release but not in terms of time. I
>> don't want QEMU and kvm-unit-tests work to be delayed any
>> further, in particular.
>
> That's unfortunate, I'd thought from the PUCK call that we still had
> some time to stabilize things before merge window. But whatever you
> think is best.

Well, the merge window starts next sunday, doesn't it? If there's an -rc8
I agree there's some leeway, but that is not too likely.

>> Once we sort out the loose ends of patches 21-23, you could send
>> it as a pull request.
>
> Ok, as a pull request against kvm/next, or kvm/queue?

Against kvm/next.

Paolo
On May 10, 2024 6:59:37 PM GMT+02:00, Paolo Bonzini <pbonzini@redhat.com> wrote:
>Well, the merge window starts next sunday, doesn't it? If there's an -rc8
>I agree there's some leeway, but that is not too likely.

Nah, the merge window just opened yesterday.

--
Sent from a small device: formatting sucks and brevity is inevitable.
On Fri, May 10, 2024 at 6:59 PM Paolo Bonzini <pbonzini@redhat.com> wrote:
> Well, the merge window starts next sunday, doesn't it? If there's an
> -rc8 I agree there's some leeway, but that is not too likely.
>
> >> Once we sort out the loose ends of patches 21-23, you could send
> >> it as a pull request.
> > Ok, as a pull request against kvm/next, or kvm/queue?
>
> Against kvm/next.

Ah no, only kvm/queue has the preparatory hooks - they make no sense
without something that uses them. kvm/queue is ready now.

Also, please send the pull request "QEMU style", i.e. with patches as
replies. If there's an -rc8, I'll probably pull it on Thursday morning.

Paolo
On Fri, May 10, 2024 at 06:58:45AM -0700, Sean Christopherson wrote:
> On Thu, May 09, 2024, Michael Roth wrote:
> > The intended logic when handling #NPFs with the RMP bit set (31) is to
> > first check to see if the #NPF requires a shared<->private transition
> > and, if so, to go ahead and let the corresponding KVM_EXIT_MEMORY_FAULT
> > get forwarded on to userspace before proceeding with any handling of
> > other potential RMP fault conditions like needing to PSMASH the RMP
> > entry/etc (which will be done later if the guest still re-faults after
> > the KVM_EXIT_MEMORY_FAULT is processed by userspace).
> >
> > The determination of whether any userspace handling of
> > KVM_EXIT_MEMORY_FAULT is needed is done by interpreting the return code
> > of kvm_mmu_page_fault(). However, the current code misinterprets the
> > return code, expecting 0 to indicate a userspace exit rather than less
> > than 0 (-EFAULT). This leads to the following unexpected behavior:
> >
> > - for KVM_EXIT_MEMORY_FAULTs resulting for implicit shared->private
> > conversions, warnings get printed from sev_handle_rmp_fault()
> > because it does not expect to be called for GPAs where
> > KVM_MEMORY_ATTRIBUTE_PRIVATE is not set. Standard linux guests don't
> > generally do this, but it is allowed and should be handled
> > similarly to private->shared conversions rather than triggering any
> > sort of warnings
> >
> > - if gmem support for 2MB folios is enabled (via currently out-of-tree
> > code), implicit shared<->private conversions will always result in
> > a PSMASH being attempted, even if it's not actually needed to
> > resolve the RMP fault. This doesn't cause any harm, but results in a
> > needless PSMASH and zapping of the sPTE
> >
> > Resolve these issues by calling sev_handle_rmp_fault() only when
> > kvm_mmu_page_fault()'s return code is greater than or equal to 0,
> > indicating a KVM_MEMORY_EXIT_FAULT/-EFAULT isn't needed. While here,
> > simplify the code slightly and fix up the associated comments for better
> > clarity.
> >
> > Fixes: ccc9d836c5c3 ("KVM: SEV: Add support to handle RMP nested page faults")
> >
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
> > ---
> > arch/x86/kvm/svm/svm.c | 10 ++++------
> > 1 file changed, 4 insertions(+), 6 deletions(-)
> >
> > diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> > index 426ad49325d7..9431ce74c7d4 100644
> > --- a/arch/x86/kvm/svm/svm.c
> > +++ b/arch/x86/kvm/svm/svm.c
> > @@ -2070,14 +2070,12 @@ static int npf_interception(struct kvm_vcpu *vcpu)
> > svm->vmcb->control.insn_len);
> >
> > /*
> > - * rc == 0 indicates a userspace exit is needed to handle page
> > - * transitions, so do that first before updating the RMP table.
> > + * rc < 0 indicates a userspace exit may be needed to handle page
> > + * attribute updates, so deal with that first before handling other
> > + * potential RMP fault conditions.
> > */
> > - if (error_code & PFERR_GUEST_RMP_MASK) {
> > - if (rc == 0)
> > - return rc;
> > + if (rc >= 0 && error_code & PFERR_GUEST_RMP_MASK)
>
> This isn't correct either. A return of '0' also indiciates "exit to userspace",
> it just doesn't happen with SNP because '0' is returned only when KVM attempts
> emulation, and that too gets short-circuited by svm_check_emulate_instruction().
>
> And I would honestly drop the comment, KVM's less-than-pleasant 1/0/-errno return
> values overload is ubiquitous enough that it should be relatively self-explanatory.
>
> Or if you prefer to keep a comment, drop the part that specifically calls out
> attributes updates, because that incorrectly implies that's the _only_ reason
> why KVM checks the return. But my vote is to drop the comment, because it
> essentially becomes "don't proceed to step 2 if step 1 failed", which kind of
> makes the reader go "well, yeah".
Ok, I think I was just paranoid after missing this. I've gone ahead and
dropped the comment, and hopefully it's now drilled into my head enough
that it's obvious to me now as well :) I've also changed the logic to
skip the extra RMP handling for rc==0 as well (should that ever arise
for any future reason):
https://github.com/mdroth/linux/commit/0a0ba0d7f7571a31f0abc68acc51f24c2a14a8cf
Thanks!
-Mike
There are a few edge-cases that the current processing for GHCB PSC
requests doesn't handle properly:
- KVM properly ignores SMASH/UNSMASH ops when they are embedded in a
PSC request buffer which contains other PSC operations, but
inadvertently forwards them to userspace as private->shared PSC
requests if they appear at the end of the buffer. Make sure these are
ignored instead, just like cases where they are not at the end of the
request buffer.
- Current code handles non-zero 'cur_page' fields when they are at the
beginning of a new GPA range, but does not handle them properly when
iterating through subsequent entries that are otherwise part of a
contiguous range. Fix up the handling so that these entries are not
combined into a larger contiguous range that includes unintended GPA
ranges and are instead processed later as the start of a new
contiguous range.
- The page size variable used to track 2M entries in KVM for inflight PSCs
might be artificially set to a different value, which can lead to
unexpected values in the entry's final 'cur_page' update. Use the
entry's 'pagesize' field instead to determine what the value of
'cur_page' should be upon completion of processing.
While here, also add a small helper for clearing in-flight PSC
variables and fix up comments for better readability.
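For reference, the 'cur_page', 'pagesize', and 'operation' fields mentioned
above live in the GHCB page state change structures, roughly as defined in
asm/sev-common.h (reproduced here for context; consult the header/GHCB spec for
the authoritative layout):

	struct psc_hdr {
		u16 cur_entry;
		u16 end_entry;
		u32 reserved;
	} __packed;

	struct psc_entry {
		u64 cur_page	: 12,
		    gfn		: 40,
		    operation	: 4,
		    pagesize	: 1,
		    reserved	: 7;
	} __packed;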
Fixes: 266205d810d2 ("KVM: SEV: Add support to handle Page State Change VMGEXIT")
Signed-off-by: Michael Roth <michael.roth@amd.com>
---
arch/x86/kvm/svm/sev.c | 73 +++++++++++++++++++++++++++---------------
1 file changed, 47 insertions(+), 26 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 35f0bd91f92e..ab23329e2bd0 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3555,43 +3555,50 @@ struct psc_buffer {
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
-static int snp_complete_psc(struct kvm_vcpu *vcpu)
+static void snp_reset_inflight_psc(struct vcpu_svm *svm)
+{
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_2m = false;
+}
+
+static void __snp_complete_psc(struct vcpu_svm *svm)
{
- struct vcpu_svm *svm = to_svm(vcpu);
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
struct psc_entry *entries = psc->entries;
struct psc_hdr *hdr = &psc->hdr;
- __u64 psc_ret;
__u16 idx;
- if (vcpu->run->hypercall.ret) {
- psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
- goto out_resume;
- }
-
/*
* Everything in-flight has been processed successfully. Update the
- * corresponding entries in the guest's PSC buffer.
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
*/
for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
svm->sev_es.psc_inflight--, idx++) {
struct psc_entry *entry = &entries[idx];
- entry->cur_page = svm->sev_es.psc_2m ? 512 : 1;
+ entry->cur_page = entry->pagesize ? 512 : 1;
}
hdr->cur_entry = idx;
+}
- /* Handle the next range (if any). */
- return snp_begin_psc(svm, psc);
+static int snp_complete_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
-out_resume:
- svm->sev_es.psc_idx = 0;
- svm->sev_es.psc_inflight = 0;
- svm->sev_es.psc_2m = false;
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+ if (vcpu->run->hypercall.ret) {
+ snp_reset_inflight_psc(svm);
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
- return 1; /* resume guest */
+ __snp_complete_psc(svm);
+
+ /* Handle the next range (if any). */
+ return snp_begin_psc(svm, psc);
}
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
@@ -3634,6 +3641,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
goto out_resume;
}
+next_range:
/* Find the start of the next range which needs processing. */
for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
__u16 cur_page;
@@ -3642,11 +3650,6 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
entry_start = entries[idx];
- /* Only private/shared conversions are currently supported. */
- if (entry_start.operation != VMGEXIT_PSC_OP_PRIVATE &&
- entry_start.operation != VMGEXIT_PSC_OP_SHARED)
- continue;
-
gfn = entry_start.gfn;
cur_page = entry_start.cur_page;
huge = entry_start.pagesize;
@@ -3687,6 +3690,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
if (entry.operation != entry_start.operation ||
entry.gfn != entry_start.gfn + npages ||
+ entry.cur_page != 0 ||
!!entry.pagesize != svm->sev_es.psc_2m)
break;
@@ -3694,6 +3698,25 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
npages += entry_start.pagesize ? 512 : 1;
}
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can always be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion, so only do
+ * that if there are no more entries to process and the entire request
+ * has been completed.
+ */
+ if (entry_start.operation != VMGEXIT_PSC_OP_PRIVATE &&
+ entry_start.operation != VMGEXIT_PSC_OP_SHARED) {
+ if (idx > idx_end)
+ return snp_complete_psc(vcpu);
+
+ __snp_complete_psc(svm);
+ goto next_range;
+ }
+
vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
vcpu->run->hypercall.args[0] = gpa;
@@ -3709,9 +3732,7 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
return 0; /* forward request to userspace */
out_resume:
- svm->sev_es.psc_idx = 0;
- svm->sev_es.psc_inflight = 0;
- svm->sev_es.psc_2m = false;
+ snp_reset_inflight_psc(svm);
ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
return 1; /* resume guest */
--
2.25.1
On 5/10/24 03:58, Michael Roth wrote:
> There are a few edge-cases that the current processing for GHCB PSC
> requests doesn't handle properly:
>
> - KVM properly ignores SMASH/UNSMASH ops when they are embedded in a
> PSC request buffer which contains other PSC operations, but
> inadvertantly forwards them to userspace as private->shared PSC
> requests if they appear at the end of the buffer. Make sure these are
> ignored instead, just like cases where they are not at the end of the
> request buffer.
>
> - Current code handles non-zero 'cur_page' fields when they are at the
> beginning of a new GPA range, but it is not handling properly when
> iterating through subsequent entries which are otherwise part of a
> contiguous range. Fix up the handling so that these entries are not
> combined into a larger contiguous range that include unintended GPA
> ranges and are instead processed later as the start of a new
> contiguous range.
>
> - The page size variable used to track 2M entries in KVM for inflight PSCs
> might be artifically set to a different value, which can lead to
> unexpected values in the entry's final 'cur_page' update. Use the
> entry's 'pagesize' field instead to determine what the value of
> 'cur_page' should be upon completion of processing.
>
> While here, also add a small helper for clearing in-flight PSCs
> variables and fix up comments for better readability.
>
> Fixes: 266205d810d2 ("KVM: SEV: Add support to handle Page State Change VMGEXIT")
> Signed-off-by: Michael Roth <michael.roth@amd.com>
There are some more improvements that can be made to the readability of
the code... this one is already better than what the patch is fixing up, but I
don't like the code that is in the loop even though it is unconditionally
followed by "break".
Here's my attempt at replacing this patch, which is really more of a
rewrite of the whole function... Untested beyond compilation.
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 35f0bd91f92e..6e612789c35f 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -3555,23 +3555,25 @@ struct psc_buffer {
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
-static int snp_complete_psc(struct kvm_vcpu *vcpu)
+static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
+{
+ svm->sev_es.psc_inflight = 0;
+ svm->sev_es.psc_idx = 0;
+ svm->sev_es.psc_2m = 0;
+ ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+}
+
+static void __snp_complete_one_psc(struct vcpu_svm *svm)
{
- struct vcpu_svm *svm = to_svm(vcpu);
struct psc_buffer *psc = svm->sev_es.ghcb_sa;
struct psc_entry *entries = psc->entries;
struct psc_hdr *hdr = &psc->hdr;
- __u64 psc_ret;
__u16 idx;
- if (vcpu->run->hypercall.ret) {
- psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
- goto out_resume;
- }
-
/*
* Everything in-flight has been processed successfully. Update the
- * corresponding entries in the guest's PSC buffer.
+ * corresponding entries in the guest's PSC buffer and zero out the
+ * count of in-flight PSC entries.
*/
for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
svm->sev_es.psc_inflight--, idx++) {
@@ -3581,17 +3583,22 @@ static int snp_complete_psc(struct kvm_vcpu *vcpu)
}
hdr->cur_entry = idx;
+}
+
+static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct psc_buffer *psc = svm->sev_es.ghcb_sa;
+
+ if (vcpu->run->hypercall.ret) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1; /* resume guest */
+ }
+
+ __snp_complete_one_psc(svm);
/* Handle the next range (if any). */
return snp_begin_psc(svm, psc);
-
-out_resume:
- svm->sev_es.psc_idx = 0;
- svm->sev_es.psc_inflight = 0;
- svm->sev_es.psc_2m = false;
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
-
- return 1; /* resume guest */
}
static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
@@ -3601,18 +3608,20 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
struct psc_hdr *hdr = &psc->hdr;
struct psc_entry entry_start;
u16 idx, idx_start, idx_end;
- __u64 psc_ret, gpa;
+ u64 gfn;
int npages;
-
- /* There should be no other PSCs in-flight at this point. */
- if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
- psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
- goto out_resume;
- }
+ bool huge;
if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
- psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
- goto out_resume;
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
+ }
+
+next_range:
+ /* There should be no other PSCs in-flight at this point. */
+ if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
+ return 1;
}
/*
@@ -3624,97 +3633,99 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
idx_end = hdr->end_entry;
if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
- psc_ret = VMGEXIT_PSC_ERROR_INVALID_HDR;
- goto out_resume;
- }
-
- /* Nothing more to process. */
- if (idx_start > idx_end) {
- psc_ret = 0;
- goto out_resume;
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
+ return 1;
}
/* Find the start of the next range which needs processing. */
for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
- __u16 cur_page;
- gfn_t gfn;
- bool huge;
-
entry_start = entries[idx];
-
- /* Only private/shared conversions are currently supported. */
- if (entry_start.operation != VMGEXIT_PSC_OP_PRIVATE &&
- entry_start.operation != VMGEXIT_PSC_OP_SHARED)
- continue;
-
gfn = entry_start.gfn;
- cur_page = entry_start.cur_page;
huge = entry_start.pagesize;
+ npages = huge ? 512 : 1;
- if ((huge && (cur_page > 512 || !IS_ALIGNED(gfn, 512))) ||
- (!huge && cur_page > 1)) {
- psc_ret = VMGEXIT_PSC_ERROR_INVALID_ENTRY;
- goto out_resume;
+ if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
+ snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
+ return 1;
}
+ if (entry_start.cur_page) {
+ /*
+ * If this is a partially-completed 2M range, force 4K
+ * handling for the remaining pages since they're effectively
+ * split at this point. Subsequent code should ensure this
+ * doesn't get combined with adjacent PSC entries where 2M
+ * handling is still possible.
+ */
+ npages -= entry_start.cur_page;
+ gfn += entry_start.cur_page;
+ huge = false;
+ }
+ if (npages)
+ break;
+
/* All sub-pages already processed. */
- if ((huge && cur_page == 512) || (!huge && cur_page == 1))
- continue;
-
- /*
- * If this is a partially-completed 2M range, force 4K handling
- * for the remaining pages since they're effectively split at
- * this point. Subsequent code should ensure this doesn't get
- * combined with adjacent PSC entries where 2M handling is still
- * possible.
- */
- svm->sev_es.psc_2m = cur_page ? false : huge;
- svm->sev_es.psc_idx = idx;
- svm->sev_es.psc_inflight = 1;
-
- gpa = gfn_to_gpa(gfn + cur_page);
- npages = huge ? 512 - cur_page : 1;
- break;
}
+ if (idx > idx_end) {
+ /* Nothing more to process. */
+ snp_complete_psc(svm, 0);
+ return 1;
+ }
+
+ svm->sev_es.psc_2m = huge;
+ svm->sev_es.psc_idx = idx;
+ svm->sev_es.psc_inflight = 1;
+
/*
* Find all subsequent PSC entries that contain adjacent GPA
* ranges/operations and can be combined into a single
* KVM_HC_MAP_GPA_RANGE exit.
*/
- for (idx = svm->sev_es.psc_idx + 1; idx <= idx_end; idx++) {
+ while (++idx <= idx_end) {
struct psc_entry entry = entries[idx];
if (entry.operation != entry_start.operation ||
- entry.gfn != entry_start.gfn + npages ||
- !!entry.pagesize != svm->sev_es.psc_2m)
+ entry.gfn != gfn + npages ||
+ entry.cur_page ||
+ !!entry.pagesize != huge)
break;
svm->sev_es.psc_inflight++;
- npages += entry_start.pagesize ? 512 : 1;
+ npages += huge ? 512 : 1;
}
- vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
- vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
- vcpu->run->hypercall.args[0] = gpa;
- vcpu->run->hypercall.args[1] = npages;
- vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
- ? KVM_MAP_GPA_RANGE_ENCRYPTED
- : KVM_MAP_GPA_RANGE_DECRYPTED;
- vcpu->run->hypercall.args[2] |= entry_start.pagesize
- ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
- : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
- vcpu->arch.complete_userspace_io = snp_complete_psc;
+ switch (entry_start.operation) {
+ case VMGEXIT_PSC_OP_PRIVATE:
+ case VMGEXIT_PSC_OP_SHARED:
+ vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+ vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
+ vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
+ vcpu->run->hypercall.args[1] = npages;
+ vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
+ ? KVM_MAP_GPA_RANGE_ENCRYPTED
+ : KVM_MAP_GPA_RANGE_DECRYPTED;
+ vcpu->run->hypercall.args[2] |= huge
+ ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
+ : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
+ vcpu->arch.complete_userspace_io = snp_complete_one_psc;
- return 0; /* forward request to userspace */
+ return 0; /* forward request to userspace */
-out_resume:
- svm->sev_es.psc_idx = 0;
- svm->sev_es.psc_inflight = 0;
- svm->sev_es.psc_2m = false;
- ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
+ default:
+ /*
+ * Only shared/private PSC operations are currently supported, so if the
+ * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
+ * then consider the entire range completed and avoid exiting to
+ * userspace. In theory snp_complete_psc() can be called directly
+ * at this point to complete the current range and start the next one,
+ * but that could lead to unexpected levels of recursion.
+ */
+ __snp_complete_one_psc(svm);
+ goto next_range;
+ }
- return 1; /* resume guest */
+ unreachable();
}
static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
On Fri, May 10, 2024 at 07:09:07PM +0200, Paolo Bonzini wrote:
> On 5/10/24 03:58, Michael Roth wrote:
> > There are a few edge-cases that the current processing for GHCB PSC
> > requests doesn't handle properly:
> >
> > - KVM properly ignores SMASH/UNSMASH ops when they are embedded in a
> > PSC request buffer which contains other PSC operations, but
> > inadvertantly forwards them to userspace as private->shared PSC
> > requests if they appear at the end of the buffer. Make sure these are
> > ignored instead, just like cases where they are not at the end of the
> > request buffer.
> >
> > - Current code handles non-zero 'cur_page' fields when they are at the
> > beginning of a new GPA range, but it is not handling properly when
> > iterating through subsequent entries which are otherwise part of a
> > contiguous range. Fix up the handling so that these entries are not
> > combined into a larger contiguous range that include unintended GPA
> > ranges and are instead processed later as the start of a new
> > contiguous range.
> >
> > - The page size variable used to track 2M entries in KVM for inflight PSCs
> > might be artifically set to a different value, which can lead to
> > unexpected values in the entry's final 'cur_page' update. Use the
> > entry's 'pagesize' field instead to determine what the value of
> > 'cur_page' should be upon completion of processing.
> >
> > While here, also add a small helper for clearing in-flight PSCs
> > variables and fix up comments for better readability.
> >
> > Fixes: 266205d810d2 ("KVM: SEV: Add support to handle Page State Change VMGEXIT")
> > Signed-off-by: Michael Roth <michael.roth@amd.com>
>
> There are some more improvements that can be made to the readability of
> the code... this one is already better than the patch is fixing up, but I
> don't like the code that is in the loop even though it is unconditionally
> followed by "break".
>
> Here's my attempt at replacing this patch, which is really more of a
> rewrite of the whole function... Untested beyond compilation.
Thanks for the suggested rework. I tested with/without 2MB pages and
everything worked as-written. This is the full/squashed patch I plan to
include in the pull request:
https://github.com/mdroth/linux/commit/91f6d31c4dfc88dd1ac378e2db6117b0c982e63c
-Mike
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index 35f0bd91f92e..6e612789c35f 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -3555,23 +3555,25 @@ struct psc_buffer {
> static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc);
> -static int snp_complete_psc(struct kvm_vcpu *vcpu)
> +static void snp_complete_psc(struct vcpu_svm *svm, u64 psc_ret)
> +{
> + svm->sev_es.psc_inflight = 0;
> + svm->sev_es.psc_idx = 0;
> + svm->sev_es.psc_2m = 0;
> + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
> +}
> +
> +static void __snp_complete_one_psc(struct vcpu_svm *svm)
> {
> - struct vcpu_svm *svm = to_svm(vcpu);
> struct psc_buffer *psc = svm->sev_es.ghcb_sa;
> struct psc_entry *entries = psc->entries;
> struct psc_hdr *hdr = &psc->hdr;
> - __u64 psc_ret;
> __u16 idx;
> - if (vcpu->run->hypercall.ret) {
> - psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
> - goto out_resume;
> - }
> -
> /*
> * Everything in-flight has been processed successfully. Update the
> - * corresponding entries in the guest's PSC buffer.
> + * corresponding entries in the guest's PSC buffer and zero out the
> + * count of in-flight PSC entries.
> */
> for (idx = svm->sev_es.psc_idx; svm->sev_es.psc_inflight;
> svm->sev_es.psc_inflight--, idx++) {
> @@ -3581,17 +3583,22 @@ static int snp_complete_psc(struct kvm_vcpu *vcpu)
> }
> hdr->cur_entry = idx;
> +}
> +
> +static int snp_complete_one_psc(struct kvm_vcpu *vcpu)
> +{
> + struct vcpu_svm *svm = to_svm(vcpu);
> + struct psc_buffer *psc = svm->sev_es.ghcb_sa;
> +
> + if (vcpu->run->hypercall.ret) {
> + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
> + return 1; /* resume guest */
> + }
> +
> + __snp_complete_one_psc(svm);
> /* Handle the next range (if any). */
> return snp_begin_psc(svm, psc);
> -
> -out_resume:
> - svm->sev_es.psc_idx = 0;
> - svm->sev_es.psc_inflight = 0;
> - svm->sev_es.psc_2m = false;
> - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
> -
> - return 1; /* resume guest */
> }
> static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
> @@ -3601,18 +3608,20 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
> struct psc_hdr *hdr = &psc->hdr;
> struct psc_entry entry_start;
> u16 idx, idx_start, idx_end;
> - __u64 psc_ret, gpa;
> + u64 gfn;
> int npages;
> -
> - /* There should be no other PSCs in-flight at this point. */
> - if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
> - psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
> - goto out_resume;
> - }
> + bool huge;
> if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) {
> - psc_ret = VMGEXIT_PSC_ERROR_GENERIC;
> - goto out_resume;
> + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
> + return 1;
> + }
> +
> +next_range:
> + /* There should be no other PSCs in-flight at this point. */
> + if (WARN_ON_ONCE(svm->sev_es.psc_inflight)) {
> + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_GENERIC);
> + return 1;
> }
> /*
> @@ -3624,97 +3633,99 @@ static int snp_begin_psc(struct vcpu_svm *svm, struct psc_buffer *psc)
> idx_end = hdr->end_entry;
> if (idx_end >= VMGEXIT_PSC_MAX_COUNT) {
> - psc_ret = VMGEXIT_PSC_ERROR_INVALID_HDR;
> - goto out_resume;
> - }
> -
> - /* Nothing more to process. */
> - if (idx_start > idx_end) {
> - psc_ret = 0;
> - goto out_resume;
> + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_HDR);
> + return 1;
> }
> /* Find the start of the next range which needs processing. */
> for (idx = idx_start; idx <= idx_end; idx++, hdr->cur_entry++) {
> - __u16 cur_page;
> - gfn_t gfn;
> - bool huge;
> -
> entry_start = entries[idx];
> -
> - /* Only private/shared conversions are currently supported. */
> - if (entry_start.operation != VMGEXIT_PSC_OP_PRIVATE &&
> - entry_start.operation != VMGEXIT_PSC_OP_SHARED)
> - continue;
> -
> gfn = entry_start.gfn;
> - cur_page = entry_start.cur_page;
> huge = entry_start.pagesize;
> + npages = huge ? 512 : 1;
> - if ((huge && (cur_page > 512 || !IS_ALIGNED(gfn, 512))) ||
> - (!huge && cur_page > 1)) {
> - psc_ret = VMGEXIT_PSC_ERROR_INVALID_ENTRY;
> - goto out_resume;
> + if (entry_start.cur_page > npages || !IS_ALIGNED(gfn, npages)) {
> + snp_complete_psc(svm, VMGEXIT_PSC_ERROR_INVALID_ENTRY);
> + return 1;
> }
> + if (entry_start.cur_page) {
> + /*
> + * If this is a partially-completed 2M range, force 4K
> + * handling for the remaining pages since they're effectively
> + * split at this point. Subsequent code should ensure this
> + * doesn't get combined with adjacent PSC entries where 2M
> + * handling is still possible.
> + */
> + npages -= entry_start.cur_page;
> + gfn += entry_start.cur_page;
> + huge = false;
> + }
> + if (npages)
> + break;
> +
> /* All sub-pages already processed. */
> - if ((huge && cur_page == 512) || (!huge && cur_page == 1))
> - continue;
> -
> - /*
> - * If this is a partially-completed 2M range, force 4K handling
> - * for the remaining pages since they're effectively split at
> - * this point. Subsequent code should ensure this doesn't get
> - * combined with adjacent PSC entries where 2M handling is still
> - * possible.
> - */
> - svm->sev_es.psc_2m = cur_page ? false : huge;
> - svm->sev_es.psc_idx = idx;
> - svm->sev_es.psc_inflight = 1;
> -
> - gpa = gfn_to_gpa(gfn + cur_page);
> - npages = huge ? 512 - cur_page : 1;
> - break;
> }
> + if (idx > idx_end) {
> + /* Nothing more to process. */
> + snp_complete_psc(svm, 0);
> + return 1;
> + }
> +
> + svm->sev_es.psc_2m = huge;
> + svm->sev_es.psc_idx = idx;
> + svm->sev_es.psc_inflight = 1;
> +
> /*
> * Find all subsequent PSC entries that contain adjacent GPA
> * ranges/operations and can be combined into a single
> * KVM_HC_MAP_GPA_RANGE exit.
> */
> - for (idx = svm->sev_es.psc_idx + 1; idx <= idx_end; idx++) {
> + while (++idx <= idx_end) {
> struct psc_entry entry = entries[idx];
> if (entry.operation != entry_start.operation ||
> - entry.gfn != entry_start.gfn + npages ||
> - !!entry.pagesize != svm->sev_es.psc_2m)
> + entry.gfn != gfn + npages ||
> + entry.cur_page ||
> + !!entry.pagesize != huge)
> break;
> svm->sev_es.psc_inflight++;
> - npages += entry_start.pagesize ? 512 : 1;
> + npages += huge ? 512 : 1;
> }
> - vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
> - vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
> - vcpu->run->hypercall.args[0] = gpa;
> - vcpu->run->hypercall.args[1] = npages;
> - vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
> - ? KVM_MAP_GPA_RANGE_ENCRYPTED
> - : KVM_MAP_GPA_RANGE_DECRYPTED;
> - vcpu->run->hypercall.args[2] |= entry_start.pagesize
> - ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
> - : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
> - vcpu->arch.complete_userspace_io = snp_complete_psc;
> + switch (entry_start.operation) {
> + case VMGEXIT_PSC_OP_PRIVATE:
> + case VMGEXIT_PSC_OP_SHARED:
> + vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
> + vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE;
> + vcpu->run->hypercall.args[0] = gfn_to_gpa(gfn);
> + vcpu->run->hypercall.args[1] = npages;
> + vcpu->run->hypercall.args[2] = entry_start.operation == VMGEXIT_PSC_OP_PRIVATE
> + ? KVM_MAP_GPA_RANGE_ENCRYPTED
> + : KVM_MAP_GPA_RANGE_DECRYPTED;
> + vcpu->run->hypercall.args[2] |= huge
> + ? KVM_MAP_GPA_RANGE_PAGE_SZ_2M
> + : KVM_MAP_GPA_RANGE_PAGE_SZ_4K;
> + vcpu->arch.complete_userspace_io = snp_complete_one_psc;
> - return 0; /* forward request to userspace */
> + return 0; /* forward request to userspace */
> -out_resume:
> - svm->sev_es.psc_idx = 0;
> - svm->sev_es.psc_inflight = 0;
> - svm->sev_es.psc_2m = false;
> - ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, psc_ret);
> + default:
> + /*
> + * Only shared/private PSC operations are currently supported, so if the
> + * entire range consists of unsupported operations (e.g. SMASH/UNSMASH),
> + * then consider the entire range completed and avoid exiting to
> + * userspace. In theory snp_complete_one_psc() can be called directly
> + * at this point to complete the current range and start the next one,
> + * but that could lead to unexpected levels of recursion.
> + */
> + __snp_complete_one_psc(svm);
> + goto next_range;
> + }
> - return 1; /* resume guest */
> + unreachable();
> }
> static int __sev_snp_update_protected_guest_state(struct kvm_vcpu *vcpu)
>
>
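
For completeness, the userspace side of the forwarded exit looks roughly like the sketch below. This is a hypothetical VMM handler, not code from any particular VMM: vm_fd and handle_map_gpa_range() are assumed names, the constants come from the KVM UAPI headers, and error handling is reduced to setting hypercall.ret so that snp_complete_one_psc() reports VMGEXIT_PSC_ERROR_GENERIC back to the guest on failure.

/*
 * Hypothetical VMM-side handler for the KVM_EXIT_HYPERCALL /
 * KVM_HC_MAP_GPA_RANGE exit produced by snp_begin_psc().  args[0..2] are
 * packed as in the diff above; vm_fd and the surrounding run loop are
 * assumed to exist elsewhere in the VMM.
 */
#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <stdint.h>
#include <sys/ioctl.h>

static void handle_map_gpa_range(int vm_fd, struct kvm_run *run)
{
        uint64_t gpa    = run->hypercall.args[0];
        uint64_t npages = run->hypercall.args[1];       /* in 4K units */
        uint64_t flags  = run->hypercall.args[2];
        struct kvm_memory_attributes attrs = {
                .address    = gpa,
                .size       = npages * 4096,
                .attributes = (flags & KVM_MAP_GPA_RANGE_ENCRYPTED) ?
                              KVM_MEMORY_ATTRIBUTE_PRIVATE : 0,
        };

        /*
         * Flip the shared/private attribute for the whole range; the 2M/4K
         * hint in args[2] does not change the attribute update itself.
         */
        run->hypercall.ret = ioctl(vm_fd, KVM_SET_MEMORY_ATTRIBUTES, &attrs) ?
                             1 /* -> VMGEXIT_PSC_ERROR_GENERIC in the guest */ : 0;
}

On the next KVM_RUN the complete_userspace_io callback (snp_complete_one_psc()) checks hypercall.ret, updates the guest's PSC buffer via __snp_complete_one_psc(), and starts the next range, matching the flow in the diff above.
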