On AMD CPUs that do not ensure cache coherency for encrypted memory, each
memory page reclamation in an SEV guest triggers a call to
wbinvd_on_all_cpus(), thereby hurting the performance of other programs on
the host.
Typically, an AMD server may have 128 cores or more, while the SEV guest
might only utilize 8 of these cores. Meanwhile, the host can use qemu-affinity
to bind these 8 vCPUs to specific physical CPUs.
Therefore, keeping a record of the physical core numbers each time a vCPU
runs can help avoid flushing the cache for all CPUs every time.
Since sev_flush_asids() is not tied to a single VM, replace all calls to
wbinvd_on_all_cpus() with sev_do_wbinvd() except for the one in
sev_flush_asids().
Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn>
---
arch/x86/kvm/svm/sev.c | 48 ++++++++++++++++++++++++++++++++++++++----
arch/x86/kvm/svm/svm.c | 2 ++
arch/x86/kvm/svm/svm.h | 4 ++++
3 files changed, 50 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index f760106c3..3a129aa61 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -215,6 +215,42 @@ static void sev_asid_free(struct kvm_sev_info *sev)
sev->misc_cg = NULL;
}
+static struct cpumask *sev_get_wbinvd_dirty_mask(struct kvm *kvm)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+
+ return sev->wbinvd_dirty_mask;
+}
+
+void sev_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ /*
+ * The per-VM wbinvd_dirty_mask should record all physical CPUs
+ * that are running a SEV guest and be used in memory reclamation.
+ *
+ * Migrating vCPUs between pCPUs is tricky. We cannot clear
+ * this mask each time reclamation finishes and record it again
+ * before VMRUN, because we cannot guarantee the pCPU will exit
+ * to VMM before the next reclamation happens.
+ *
+ * Thus we just keep stale pCPU numbers in the mask if vCPU
+ * migration happens.
+ */
+ cpumask_set_cpu(cpu, sev_get_wbinvd_dirty_mask(vcpu->kvm));
+}
+
+static void sev_do_wbinvd(struct kvm *kvm)
+{
+ struct cpumask *dirty_mask = sev_get_wbinvd_dirty_mask(kvm);
+
+ /*
+ * Although dirty_mask is not maintained perfectly and may lead
+ * to wbinvd on physical CPUs that are not running a SEV guest,
+ * it's still better than wbinvd_on_all_cpus().
+ */
+ wbinvd_on_many_cpus(dirty_mask);
+}
+
static void sev_decommission(unsigned int handle)
{
struct sev_data_decommission decommission;
@@ -265,6 +301,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_platform_init(&argp->error);
if (ret)
goto e_free;
+ if (!zalloc_cpumask_var(&sev->wbinvd_dirty_mask, GFP_KERNEL_ACCOUNT))
+ goto e_free;
+
INIT_LIST_HEAD(&sev->regions_list);
INIT_LIST_HEAD(&sev->mirror_vms);
@@ -2048,7 +2087,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
* releasing the pages back to the system for use. CLFLUSH will
* not do this, so issue a WBINVD.
*/
- wbinvd_on_all_cpus();
+ sev_do_wbinvd(kvm);
__unregister_enc_region_locked(kvm, region);
@@ -2152,7 +2191,7 @@ void sev_vm_destroy(struct kvm *kvm)
* releasing the pages back to the system for use. CLFLUSH will
* not do this, so issue a WBINVD.
*/
- wbinvd_on_all_cpus();
+ sev_do_wbinvd(kvm);
/*
* if userspace was terminated before unregistering the memory regions
@@ -2168,6 +2207,7 @@ void sev_vm_destroy(struct kvm *kvm)
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
+ free_cpumask_var(sev->wbinvd_dirty_mask);
}
void __init sev_set_cpu_caps(void)
@@ -2343,7 +2383,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
return;
do_wbinvd:
- wbinvd_on_all_cpus();
+ sev_do_wbinvd(vcpu->kvm);
}
void sev_guest_memory_reclaimed(struct kvm *kvm)
@@ -2351,7 +2391,7 @@ void sev_guest_memory_reclaimed(struct kvm *kvm)
if (!sev_guest(kvm))
return;
- wbinvd_on_all_cpus();
+ sev_do_wbinvd(kvm);
}
void sev_free_vcpu(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e90b429c8..6ec118df3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1560,6 +1560,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
}
if (kvm_vcpu_apicv_active(vcpu))
avic_vcpu_load(vcpu, cpu);
+ if (sev_guest(vcpu->kvm))
+ sev_vcpu_load(vcpu, cpu);
}
static void svm_vcpu_put(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 8ef95139c..dfb889c91 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -90,6 +90,9 @@ struct kvm_sev_info {
struct list_head mirror_entry; /* Use as a list entry of mirrors */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
+
+ /* CPUs invoked VMRUN should do wbinvd after guest memory is reclaimed */
+ struct cpumask *wbinvd_dirty_mask;
};
struct kvm_svm {
@@ -694,6 +697,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm);
void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
+void sev_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
/* vmenter.S */
--
2.34.1
+Kevin
My apologies for the very slow review.
On Thu, Apr 11, 2024, Zheyun Shen wrote:
> On AMD CPUs without ensuring cache consistency, each memory page
> reclamation in an SEV guest triggers a call to wbinvd_on_all_cpus(),
> thereby affecting the performance of other programs on the host.
>
> Typically, an AMD server may have 128 cores or more, while the SEV guest
> might only utilize 8 of these cores. Meanwhile, host can use qemu-affinity
> to bind these 8 vCPUs to specific physical CPUs.
>
> Therefore, keeping a record of the physical core numbers each time a vCPU
> runs can help avoid flushing the cache for all CPUs every time.
>
> Since the usage of sev_flush_asids() isn't tied to a single VM, we just
> replace all wbinvd_on_all_cpus() with sev_do_wbinvd() except for that
> in sev_flush_asids().
>
> Signed-off-by: Zheyun Shen <szy0127@sjtu.edu.cn>
> ---
> arch/x86/kvm/svm/sev.c | 48 ++++++++++++++++++++++++++++++++++++++----
> arch/x86/kvm/svm/svm.c | 2 ++
> arch/x86/kvm/svm/svm.h | 4 ++++
> 3 files changed, 50 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index f760106c3..3a129aa61 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -215,6 +215,42 @@ static void sev_asid_free(struct kvm_sev_info *sev)
> sev->misc_cg = NULL;
> }
>
> +static struct cpumask *sev_get_wbinvd_dirty_mask(struct kvm *kvm)
> +{
> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> +
> + return sev->wbinvd_dirty_mask;
> +}
> +
> +void sev_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> + /*
> + * The per-VM wbinvd_dirty_mask should record all physical CPUs
Don't hedge with "should", just state KVM's behavior. E.g.
/*
* To optimize cache flushes when memory is reclaimed from an SEV VM,
* track physical CPUs that enter the guest for SEV VMs and thus can
* have encrypted, dirty data in the cache, and flush caches only for
* CPUs that have entered the guest.
*/
> + * that are running a SEV guest and be used in memory reclamation.
> + *
> + * Migrating vCPUs between pCPUs is tricky. We cannot clear
> + * this mask each time reclamation finishes and record it again
> + * before VMRUN, because we cannot guarantee the pCPU will exit
> + * to VMM before the next reclamation happens.
Migration is easy enough to solve (I think; famous last words). KVM is already
forcing an exit to service the IPI, just set the associated pCPU's bit if it has
a running vCPU loaded.
However, to play nice with multiple flushers, we'd need something like
kvm_recalculate_apic_map() to ensure subsequent flushers wait for previous
flushers to finish before reading the cpumask. Maybe a simple mutex would
suffice? Contention should be extremely rare for well-behaved setups.
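For illustration, a rough and untested sketch of what a serialized flush could
look like; the wbinvd_lock and wbinvd_scratch_mask fields are hypothetical,
wbinvd_on_many_cpus() is the helper used elsewhere in this series, and the part
where pCPUs with a loaded vCPU re-mark themselves dirty while servicing the IPI
is elided:

static void sev_do_wbinvd(struct kvm *kvm)
{
	struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

	/*
	 * Serialize flushers so a later flusher can't clear bits that an
	 * earlier flusher is still flushing on behalf of.
	 */
	mutex_lock(&sev->wbinvd_lock);

	/*
	 * Snapshot and clear the dirty mask before flushing.  CPUs that
	 * (re)enter the guest afterwards mark themselves dirty again in
	 * sev_vcpu_load(); CPUs that currently have a vCPU loaded would
	 * re-set their bit when they exit to service the flush IPI.
	 */
	cpumask_copy(sev->wbinvd_scratch_mask, sev->wbinvd_dirty_mask);
	cpumask_clear(sev->wbinvd_dirty_mask);

	wbinvd_on_many_cpus(sev->wbinvd_scratch_mask);

	mutex_unlock(&sev->wbinvd_lock);
}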
Kevin, since I believe your use case cares about vCPU migration, is this
something you'd be interested in tackling? It can go on top, i.e. I don't think
this base series needs to be held up for fancier migration handling, it's a clear
improvement over blasting WBINVD to all CPUs.
> + *
> + * Thus we just keep stale pCPU numbers in the mask if vCPU
> + * migration happens.
> + */
This can be conditioned on vcpu->wants_to_run, so that loading a vCPU outside of
KVM_RUN doesn't trigger WBINVD.
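I.e., roughly (reusing the helper added by this patch):

	if (vcpu->wants_to_run)
		cpumask_set_cpu(cpu, sev_get_wbinvd_dirty_mask(vcpu->kvm));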
> + cpumask_set_cpu(cpu, sev_get_wbinvd_dirty_mask(vcpu->kvm));
> +}
> +
> +static void sev_do_wbinvd(struct kvm *kvm)
> +{
> + struct cpumask *dirty_mask = sev_get_wbinvd_dirty_mask(kvm);
> +
> + /*
> + * Although dirty_mask is not maintained perfectly and may lead
> + * to wbinvd on physical CPUs that are not running a SEV guest,
> + * it's still better than wbinvd_on_all_cpus().
This belongs in the changelog not as a comment. This would be a good spot to add
the:
/*
* TODO: Clear CPUs from the bitmap prior to flushing. Doing so
* requires serializing multiple calls and having CPUs mark themselves
* "dirty" if they are currently running a vCPU for the VM.
*/
> + */
> + wbinvd_on_many_cpus(dirty_mask);
> +}
> +
> static void sev_decommission(unsigned int handle)
> {
> struct sev_data_decommission decommission;
> @@ -265,6 +301,9 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
> ret = sev_platform_init(&argp->error);
> if (ret)
> goto e_free;
> + if (!zalloc_cpumask_var(&sev->wbinvd_dirty_mask, GFP_KERNEL_ACCOUNT))
> + goto e_free;
> +
>
> INIT_LIST_HEAD(&sev->regions_list);
> INIT_LIST_HEAD(&sev->mirror_vms);
> @@ -2048,7 +2087,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
> * releasing the pages back to the system for use. CLFLUSH will
> * not do this, so issue a WBINVD.
> */
> - wbinvd_on_all_cpus();
> + sev_do_wbinvd(kvm);
Hmm, I am not convinced that optimizing sev_mem_enc_unregister_region() is worth
doing. Nothing here prevents a vCPU from racing with unregistering the region.
That said, this path isn't exactly safe as it is, because KVM essentially relies
on userspace to do the right thing. And userspace can only corrupt itself,
because the memory hasn't actually been freed, just unpinned. If userspace hasn't
ensured the guest can't access the memory, it's already hosed, so I suppose we
might as well, because why not.
All the other paths in KVM ensure vCPUs don't have access to the relevant regions,
*before* doing WBINVD.
> __unregister_enc_region_locked(kvm, region);
>
> @@ -2152,7 +2191,7 @@ void sev_vm_destroy(struct kvm *kvm)
> * releasing the pages back to the system for use. CLFLUSH will
> * not do this, so issue a WBINVD.
> */
> - wbinvd_on_all_cpus();
> + sev_do_wbinvd(kvm);
I am 99% certain this wbinvd_on_all_cpus() can simply be dropped. sev_vm_destroy()
is called after KVM's mmu_notifier has been unregistered, which means it's called
after kvm_mmu_notifier_release() => kvm_arch_guest_memory_reclaimed().
> /*
> * if userspace was terminated before unregistering the memory regions
> @@ -2168,6 +2207,7 @@ void sev_vm_destroy(struct kvm *kvm)
>
> sev_unbind_asid(kvm, sev->handle);
> sev_asid_free(sev);
> + free_cpumask_var(sev->wbinvd_dirty_mask);
> }
>
> void __init sev_set_cpu_caps(void)
> @@ -2343,7 +2383,7 @@ static void sev_flush_encrypted_page(struct kvm_vcpu *vcpu, void *va)
Similar to sev_vm_destroy(), I'm quite certain sev_flush_encrypted_page() is
completely superfluous. It's used only by sev_free_vcpu(), and sev_free_vcpu()
is called after kvm_mmu_notifier_release(). sev_free_vcpu() is also called when
vCPU creation fails, but the vCPU can't have entered the guest in that case, not
to mention its VMSA can't have been encrypted.
So I think we can delete this one too.
> return;
>
> do_wbinvd:
> - wbinvd_on_all_cpus();
> + sev_do_wbinvd(vcpu->kvm);
> }
>
> void sev_guest_memory_reclaimed(struct kvm *kvm)
> @@ -2351,7 +2391,7 @@ void sev_guest_memory_reclaimed(struct kvm *kvm)
> if (!sev_guest(kvm))
> return;
>
> - wbinvd_on_all_cpus();
> + sev_do_wbinvd(kvm);
> }
>
> void sev_free_vcpu(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index e90b429c8..6ec118df3 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -1560,6 +1560,8 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> }
> if (kvm_vcpu_apicv_active(vcpu))
> avic_vcpu_load(vcpu, cpu);
> + if (sev_guest(vcpu->kvm))
> + sev_vcpu_load(vcpu, cpu);
> }
>
> static void svm_vcpu_put(struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index 8ef95139c..dfb889c91 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -90,6 +90,9 @@ struct kvm_sev_info {
> struct list_head mirror_entry; /* Use as a list entry of mirrors */
> struct misc_cg *misc_cg; /* For misc cgroup accounting */
> atomic_t migration_in_progress;
> +
> + /* CPUs invoked VMRUN should do wbinvd after guest memory is reclaimed */
> + struct cpumask *wbinvd_dirty_mask;
> };
>
> struct kvm_svm {
> @@ -694,6 +697,7 @@ void sev_es_vcpu_reset(struct vcpu_svm *svm);
> void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
> void sev_es_prepare_switch_to_guest(struct sev_es_save_area *hostsa);
> void sev_es_unmap_ghcb(struct vcpu_svm *svm);
> +void sev_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
>
> /* vmenter.S */
>
> --
> 2.34.1
>
On Tue, Dec 3, 2024 at 4:27 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, Apr 11, 2024, Zheyun Shen wrote:
> >
> > + * that are running a SEV guest and be used in memory reclamation.
> > + *
> > + * Migrating vCPUs between pCPUs is tricky. We cannot clear
> > + * this mask each time reclamation finishes and record it again
> > + * before VMRUN, because we cannot guarantee the pCPU will exit
> > + * to VMM before the next reclamation happens.
>
> Migration is easy enough to solve (I think; famous last words). KVM is already
> forcing an exit to service the IPI, just set the associated pCPU's bit if it has
> a running vCPU loaded.
>
> However, to play nice with multiple flushers, we'd need something like
> kvm_recalculate_apic_map() to ensure subsequent flushers wait for previous
> flushers to finish before reading the cpumask. Maybe a simple mutex would
> suffice? Contention should be extremely rare for well-behaved setups.
>
> Kevin, since I believe your use case cares about vCPU migration, is this
> something you'd be interesting in tackling? It can go on top, i.e. I don't think
> this base series needs to be held up for fancier migration handling, it's a clear
> improvement over blasting WBINVD to all CPUs.
I'm happy to take a look. I agree with your initial assessment of what
needs to be done. I also agree that we don't need to hold up this
series for it, nor should we hold up [0] ("KVM: SEV: Prefer WBNOINVD
over WBINVD for cache maintenance efficiency") (assuming that patch
series checks out, of course).
[0] https://lore.kernel.org/lkml/20241203005921.1119116-1-kevinloughlin@google.com/
> > @@ -2048,7 +2087,7 @@ int sev_mem_enc_unregister_region(struct kvm *kvm,
> > * releasing the pages back to the system for use. CLFLUSH will
> > * not do this, so issue a WBINVD.
> > */
> > - wbinvd_on_all_cpus();
> > + sev_do_wbinvd(kvm);
>
> Hmm, I am not convinced that optimizing sev_mem_enc_unregister_region() is worth
> doing. Nothing here prevents a vCPU from racing with unregistering the region.
> That said, this path isn't exactly safe as it is, because KVM essentially relies
> on userspace to do the right thing. And userspace can only corrupt itself,
> because the memory hasn't actually been freed, just unpinned. If userspace hasn't
> ensured the guest can't access the memory, it's already hosed, so I supposed we
> might as well, because why not.
Yeah, I see your point but likewise vote for "might as well" for now.
There's some additional (arguably orthogonal) cleanup in general that
could be done that I believe is best handled in a separate series
(discussed below).
> > @@ -2152,7 +2191,7 @@ void sev_vm_destroy(struct kvm *kvm)
> > * releasing the pages back to the system for use. CLFLUSH will
> > * not do this, so issue a WBINVD.
> > */
> > - wbinvd_on_all_cpus();
> > + sev_do_wbinvd(kvm);
>
> I am 99% certain this wbinvd_on_all_cpus() can simply be dropped. sev_vm_destroy()
> is called after KVM's mmu_notifier has been unregistered, which means it's called
> after kvm_mmu_notifier_release() => kvm_arch_guest_memory_reclaimed().
I think we need a bit of rework before dropping it (which I propose we
do in a separate series), but let me know if there's a mistake in my
reasoning here...
Right now, sev_guest_memory_reclaimed() issues writebacks for SEV and
SEV-ES guests but does *not* issue writebacks for SEV-SNP guests.
Thus, I believe it's possible a SEV-SNP guest reaches sev_vm_destroy()
with dirty encrypted lines in processor caches. Because SME_COHERENT
doesn't guarantee coherence across CPU-DMA interactions (d45829b351ee
("KVM: SVM: Flush when freeing encrypted pages even on SME_COHERENT
CPUs")), it seems possible that the memory gets re-allocated for DMA,
written back from an (unencrypted) DMA, and then corrupted when the
dirty encrypted version gets written back over that, right?
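For reference, the behavior described above is the early return for SNP guests
in sev_guest_memory_reclaimed(), which looks roughly like this (exact
comments/details vary by tree):

void sev_guest_memory_reclaimed(struct kvm *kvm)
{
	/* SNP private memory is guest_memfd-backed and flushed elsewhere. */
	if (!sev_guest(kvm) || sev_snp_guest(kvm))
		return;

	wbinvd_on_all_cpus();
}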
And potentially the same thing for why we can't yet drop the writeback
in sev_flush_encrypted_page() without a bit of rework?
It's true that the SNP firmware will require WBINVD before
SNP_DF_FLUSH [1], but I think we're only currently doing that when an
ASID is recycled, *not* when an ASID is deactivated.
[1] https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf
On Wed, Dec 04, 2024, Kevin Loughlin wrote:
> On Tue, Dec 3, 2024 at 4:27 PM Sean Christopherson <seanjc@google.com> wrote:
> > > @@ -2152,7 +2191,7 @@ void sev_vm_destroy(struct kvm *kvm)
> > > * releasing the pages back to the system for use. CLFLUSH will
> > > * not do this, so issue a WBINVD.
> > > */
> > > - wbinvd_on_all_cpus();
> > > + sev_do_wbinvd(kvm);
> >
> > I am 99% certain this wbinvd_on_all_cpus() can simply be dropped. sev_vm_destroy()
> > is called after KVM's mmu_notifier has been unregistered, which means it's called
> > after kvm_mmu_notifier_release() => kvm_arch_guest_memory_reclaimed().
>
> I think we need a bit of rework before dropping it (which I propose we
> do in a separate series), but let me know if there's a mistake in my
> reasoning here...
>
> Right now, sev_guest_memory_reclaimed() issues writebacks for SEV and
> SEV-ES guests but does *not* issue writebacks for SEV-SNP guests.
> Thus, I believe it's possible a SEV-SNP guest reaches sev_vm_destroy()
> with dirty encrypted lines in processor caches. Because SME_COHERENT
> doesn't guarantee coherence across CPU-DMA interactions (d45829b351ee
> ("KVM: SVM: Flush when freeing encrypted pages even on SME_COHERENT
> CPUs")), it seems possible that the memory gets re-allocated for DMA,
> written back from an (unencrypted) DMA, and then corrupted when the
> dirty encrypted version gets written back over that, right?
>
> And potentially the same thing for why we can't yet drop the writeback
> in sev_flush_encrypted_page() without a bit of rework?
Argh, this last one probably does apply to SNP. KVM requires SNP VMs to be backed
with guest_memfd, and flushing for that memory is handled by sev_gmem_invalidate().
But the VMSA is kernel allocated and so needs to be flushed manually. On the plus
side, the VMSA flush shouldn't use WB{NO}INVD unless things go sideways, so trying
to optimize that path isn't worth doing.
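For reference, the non-WBINVD path in question is the per-page VM_PAGE_FLUSH
MSR write in sev_flush_encrypted_page(), roughly paraphrased (exact form may
differ by tree):

	/* Flush only this page, by host VA + ASID, via VM_PAGE_FLUSH. */
	if (WARN_ON_ONCE(wrmsrl_safe(MSR_AMD64_VM_PAGE_FLUSH,
				     (unsigned long)va | sev_get_asid(vcpu->kvm))))
		goto do_wbinvd;	/* WB{NO}INVD only if the MSR write faults */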
> It's true that the SNP firmware will require WBINVD before
> SNP_DF_FLUSH [1], but I think we're only currently doing that when an
> ASID is recycled, *not* when an ASID is deactivated.
>
> [1] https://www.amd.com/content/dam/amd/en/documents/epyc-technical-docs/specifications/56860.pdf
On Wed, Dec 4, 2024 at 2:07 PM Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, Dec 04, 2024, Kevin Loughlin wrote:
> > On Tue, Dec 3, 2024 at 4:27 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > @@ -2152,7 +2191,7 @@ void sev_vm_destroy(struct kvm *kvm)
> > > > * releasing the pages back to the system for use. CLFLUSH will
> > > > * not do this, so issue a WBINVD.
> > > > */
> > > > - wbinvd_on_all_cpus();
> > > > + sev_do_wbinvd(kvm);
> > >
> > > I am 99% certain this wbinvd_on_all_cpus() can simply be dropped. sev_vm_destroy()
> > > is called after KVM's mmu_notifier has been unregistered, which means it's called
> > > after kvm_mmu_notifier_release() => kvm_arch_guest_memory_reclaimed().
> >
> > I think we need a bit of rework before dropping it (which I propose we
> > do in a separate series), but let me know if there's a mistake in my
> > reasoning here...
> >
> > Right now, sev_guest_memory_reclaimed() issues writebacks for SEV and
> > SEV-ES guests but does *not* issue writebacks for SEV-SNP guests.
> > Thus, I believe it's possible a SEV-SNP guest reaches sev_vm_destroy()
> > with dirty encrypted lines in processor caches. Because SME_COHERENT
> > doesn't guarantee coherence across CPU-DMA interactions (d45829b351ee
> > ("KVM: SVM: Flush when freeing encrypted pages even on SME_COHERENT
> > CPUs")), it seems possible that the memory gets re-allocated for DMA,
> > written back from an (unencrypted) DMA, and then corrupted when the
> > dirty encrypted version gets written back over that, right?
> >
> > And potentially the same thing for why we can't yet drop the writeback
> > in sev_flush_encrypted_page() without a bit of rework?
>
> Argh, this last one probably does apply to SNP. KVM requires SNP VMs to be backed
> with guest_memfd, and flushing for that memory is handled by sev_gmem_invalidate().
> But the VMSA is kernel allocated and so needs to be flushed manually. On the plus
> side, the VMSA flush shouldn't use WB{NO}INVD unless things go sideways, so trying
> to optimize that path isn't worth doing.
Ah thanks, yes agreed for both (that dropping WB{NO}INVD is fine on
the sev_vm_destroy() path given sev_gmem_invalidate() and that the
sev_flush_encrypted_page() path still needs the WB{NO}INVD as a
fallback for now).
On that note, the WBINVD in sev_mem_enc_unregister_region() can be
dropped too then, right? My understanding is that the host will
instead do WB{NO}INVD for SEV(-ES) guests in
sev_guest_memory_reclaimed(), and sev_gmem_invalidate() will handle
SEV-SNP guests.
All in all, I now agree we can drop the unneeded case(s) of issuing
WB{NO}INVDs in this series in an additional commit. I'll then rebase
[0] on the latest version of this series and can also work on the
migration optimizations atop all of it, if that works for you Sean.
[0] https://lore.kernel.org/lkml/20241203005921.1119116-1-kevinloughlin@google.com/
Thanks!
On Tue, Dec 10, 2024 at 3:56 PM Kevin Loughlin <kevinloughlin@google.com> wrote:
>
> On Wed, Dec 4, 2024 at 2:07 PM Sean Christopherson <seanjc@google.com> wrote:
> >
> > On Wed, Dec 04, 2024, Kevin Loughlin wrote:
> > > On Tue, Dec 3, 2024 at 4:27 PM Sean Christopherson <seanjc@google.com> wrote:
> > > > > @@ -2152,7 +2191,7 @@ void sev_vm_destroy(struct kvm *kvm)
> > > > > * releasing the pages back to the system for use. CLFLUSH will
> > > > > * not do this, so issue a WBINVD.
> > > > > */
> > > > > - wbinvd_on_all_cpus();
> > > > > + sev_do_wbinvd(kvm);
> > > >
> > > > I am 99% certain this wbinvd_on_all_cpus() can simply be dropped. sev_vm_destroy()
> > > > is called after KVM's mmu_notifier has been unregistered, which means it's called
> > > > after kvm_mmu_notifier_release() => kvm_arch_guest_memory_reclaimed().
> > >
> > > I think we need a bit of rework before dropping it (which I propose we
> > > do in a separate series), but let me know if there's a mistake in my
> > > reasoning here...
> > >
> > > Right now, sev_guest_memory_reclaimed() issues writebacks for SEV and
> > > SEV-ES guests but does *not* issue writebacks for SEV-SNP guests.
> > > Thus, I believe it's possible a SEV-SNP guest reaches sev_vm_destroy()
> > > with dirty encrypted lines in processor caches. Because SME_COHERENT
> > > doesn't guarantee coherence across CPU-DMA interactions (d45829b351ee
> > > ("KVM: SVM: Flush when freeing encrypted pages even on SME_COHERENT
> > > CPUs")), it seems possible that the memory gets re-allocated for DMA,
> > > written back from an (unencrypted) DMA, and then corrupted when the
> > > dirty encrypted version gets written back over that, right?
> > >
> > > And potentially the same thing for why we can't yet drop the writeback
> > > in sev_flush_encrypted_page() without a bit of rework?
> >
> > Argh, this last one probably does apply to SNP. KVM requires SNP VMs to be backed
> > with guest_memfd, and flushing for that memory is handled by sev_gmem_invalidate().
> > But the VMSA is kernel allocated and so needs to be flushed manually. On the plus
> > side, the VMSA flush shouldn't use WB{NO}INVD unless things go sideways, so trying
> > to optimize that path isn't worth doing.
>
> Ah thanks, yes agreed for both (that dropping WB{NO}INVD is fine on
> the sev_vm_destroy() path given sev_gmem_invalidate() and that the
> sev_flush_encrypted_page() path still needs the WB{NO}INVD as a
> fallback for now).
>
> On that note, the WBINVD in sev_mem_enc_unregister_region() can be
> dropped too then, right? My understanding is that the host will
> instead do WB{NO}INVD for SEV(-ES) guests in
> sev_guest_memory_reclaimed(), and sev_gmem_invalidate() will handle
> SEV-SNP guests.
Nevermind, we can't drop the WBINVD call in
sev_mem_enc_unregister_region() without a userspace opt-in because
userspace might otherwise rely on the flushing behavior; see Sean's
explanation in [0].
So all-in-all I believe...
- we can drop the call in sev_vm_destroy()
- we *cannot* drop the call in sev_flush_encrypted_page(), nor in
sev_mem_enc_unregister_region().
Zheyun, if you get to this series before my own WBNOINVD series [1], I
can just rebase on top of yours. I will defer cutting these unneeded
calls to you and simply replace applicable WBINVD calls with WBNOINVD
in my series.
[0] https://lore.kernel.org/all/ZWrM622xUb4pe7gS@google.com/T/#md364d1fdfc65dc92e306276bd51298cb817c5e53.
[1] https://lore.kernel.org/kvm/20250109225533.1841097-2-kevinloughlin@google.com/T/
>
> All in all, I now agree we can drop the unneeded case(s) of issuing
> WB{NO}INVDs in this series in an additional commit. I'll then rebase
> [0] on the latest version of this series and can also work on the
> migration optimizations atop all of it, if that works for you Sean.
>
> [0] https://lore.kernel.org/lkml/20241203005921.1119116-1-kevinloughlin@google.com/
>
> Thanks!