Since guest_memfd now supports mmap(), folios have to be prepared
before they are faulted into userspace.

When memory attributes are switched between shared and private, the
folio's up-to-date flag is cleared.

Use the folio's up-to-date flag to indicate that the folio is ready for
guest usage; the flag marks readiness for either shared or private use.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 virt/kvm/guest_memfd.c | 131 ++++++++++++++++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c    |   2 +
 virt/kvm/kvm_mm.h      |   7 +++
 3 files changed, 139 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 110c4bbb004b..fb292e542381 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -129,13 +129,29 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
}
/**
- * Use the uptodate flag to indicate that the folio is prepared for KVM's usage.
+ * Use folio's up-to-date flag to indicate that this folio is prepared for usage
+ * by the guest.
+ *
+ * This flag can be used whether the folio is prepared for PRIVATE or SHARED
+ * usage.
*/
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
folio_mark_uptodate(folio);
}
+/**
+ * Use folio's up-to-date flag to indicate that this folio is not yet prepared for
+ * usage by the guest.
+ *
+ * This flag can be used whether the folio is prepared for PRIVATE or SHARED
+ * usage.
+ */
+static inline void kvm_gmem_clear_prepared(struct folio *folio)
+{
+ folio_clear_uptodate(folio);
+}
+
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
@@ -148,6 +164,12 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index;
int r;
+ /*
+ * Defensively zero folio to avoid leaking kernel memory in
+ * uninitialized pages. This is important since pages can now be mapped
+ * into userspace, where hardware (e.g. TDX) won't be clearing those
+ * pages.
+ */
if (folio_test_hugetlb(folio)) {
folio_zero_user(folio, folio->index << PAGE_SHIFT);
} else {
@@ -1017,6 +1039,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
{
struct inode *inode;
struct folio *folio;
+ bool is_prepared;
inode = file_inode(vmf->vma->vm_file);
if (!kvm_gmem_is_faultable(inode, vmf->pgoff))
@@ -1026,6 +1049,31 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
if (!folio)
return VM_FAULT_SIGBUS;
+ is_prepared = folio_test_uptodate(folio);
+ if (!is_prepared) {
+ unsigned long nr_pages;
+ unsigned long i;
+
+ if (folio_test_hugetlb(folio)) {
+ folio_zero_user(folio, folio->index << PAGE_SHIFT);
+ } else {
+ /*
+ * Defensively zero folio to avoid leaking kernel memory in
+ * uninitialized pages. This is important since pages can now be
+ * mapped into userspace, where hardware (e.g. TDX) won't be
+ * clearing those pages.
+ *
+ * Will probably need a version of kvm_gmem_prepare_folio() to
+ * prepare the page for SHARED use.
+ */
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+ }
+
+ kvm_gmem_mark_prepared(folio);
+ }
+
vmf->page = folio_file_page(folio, vmf->pgoff);
return VM_FAULT_LOCKED;
}
@@ -1593,6 +1641,87 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+static void kvm_gmem_clear_prepared_range(struct inode *inode, pgoff_t start,
+ pgoff_t end)
+{
+ pgoff_t index;
+
+ filemap_invalidate_lock_shared(inode->i_mapping);
+
+ /* TODO: replace iteration with filemap_get_folios() for efficiency. */
+ for (index = start; index < end;) {
+ struct folio *folio;
+
+ /* Don't use kvm_gmem_get_folio to avoid allocating */
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio)) {
+ ++index;
+ continue;
+ }
+
+ kvm_gmem_clear_prepared(folio);
+
+ index = folio_next_index(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+}
+
+/**
+ * Clear the prepared flag for all folios in gfn range [@start, @end) in memslot
+ * @slot.
+ */
+static void kvm_gmem_clear_prepared_slot(struct kvm_memory_slot *slot, gfn_t start,
+ gfn_t end)
+{
+ pgoff_t start_offset;
+ pgoff_t end_offset;
+ struct file *file;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return;
+
+ start_offset = start - slot->base_gfn + slot->gmem.pgoff;
+ end_offset = end - slot->base_gfn + slot->gmem.pgoff;
+
+ kvm_gmem_clear_prepared_range(file_inode(file), start_offset, end_offset);
+
+ fput(file);
+}
+
+/**
+ * Clear the prepared flag for all folios for any slot in gfn range
+ * [@start, @end) in @kvm.
+ */
+void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+ int i;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ struct kvm_memslot_iter iter;
+ struct kvm_memslots *slots;
+
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+ struct kvm_memory_slot *slot;
+ gfn_t gfn_start;
+ gfn_t gfn_end;
+
+ slot = iter.slot;
+ gfn_start = max(start, slot->base_gfn);
+ gfn_end = min(end, slot->base_gfn + slot->npages);
+
+ if (iter.slot->flags & KVM_MEM_GUEST_MEMFD)
+ kvm_gmem_clear_prepared_slot(iter.slot, gfn_start, gfn_end);
+ }
+ }
+}
+
/**
* Returns true if pages in range [@start, @end) in inode @inode have no
* userspace mappings.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1a7bbcc31b7e..255d27df7f5c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2565,6 +2565,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
KVM_BUG_ON(r, kvm);
}
+ kvm_gmem_clear_prepared_vm(kvm, start, end);
+
kvm_handle_gfn_range(kvm, &post_set_range);
out_unlock:
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index d8ff2b380d0e..25fd0d9f66cc 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -43,6 +43,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long attrs);
+void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end);
#else
static inline void kvm_gmem_init(struct module *module)
{
@@ -68,6 +69,12 @@ static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
return 0;
}
+static inline void kvm_gmem_clear_prepared_vm(struct kvm *kvm,
+ gfn_t start, gfn_t end)
+{
+ WARN_ON_ONCE(1);
+}
+
#endif /* CONFIG_KVM_PRIVATE_MEM */
#endif /* __KVM_MM_H__ */
--
2.46.0.598.g6f2099f65c-goog
On Tue, Sep 10, 2024 at 11:44:01PM +0000, Ackerley Tng wrote:
> Since guest_memfd now supports mmap(), folios have to be prepared
> before they are faulted into userspace.
>
> When memory attributes are switched between shared and private, the
> folio's up-to-date flag is cleared.
>
> Use the folio's up-to-date flag to indicate that the folio is ready for
> guest usage; the flag marks readiness for either shared or private use.

Clearing the up-to-date flag also means that the page gets zero'd out
whenever it transitions between shared and private (either direction).
pKVM (Android) hypervisor policy can allow in-place conversion between
shared/private.

I believe the important thing is that sev_gmem_prepare() needs to be
called prior to giving the page to the guest. In my series, I had made a
->prepare_inaccessible() callback where KVM would only do this part.
When transitioning to inaccessible, only that callback would be made,
besides the bookkeeping. The folio zeroing happens once when allocating
the folio, if the folio is initially accessible (faultable).

From the x86 CoCo perspective, I think it also makes sense to not zero
the folio when changing faultability from private to shared:
- If the guest is sharing some data with the host, you've wiped the data
  and the guest has to copy it again.
- Or, if SEV/TDX enforces that the page is zero'd between transitions,
  Linux has duplicated the work that the trusted entity has already done.

Fuad and I can help add some details for the conversion. Hopefully we
can figure out some of the plan at Plumbers this week.

Thanks,
Elliot

> <snip>
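A minimal sketch of the callback split Elliot describes; the
guest_memfd_operations struct, the gmem_make_inaccessible() call site,
and stashing the ops pointer in inode->i_private are illustrative
assumptions, not code from his actual series:

	/*
	 * Sketch only: on a shared->private transition, run just the arch
	 * "make inaccessible" step; zeroing happened once at allocation
	 * time, when the folio was first made accessible (faultable).
	 */
	struct guest_memfd_operations {
		/* e.g. sev_gmem_prepare() for SNP, or a pKVM unshare hook */
		int (*prepare_inaccessible)(struct inode *inode,
					    struct folio *folio);
	};

	static int gmem_make_inaccessible(struct inode *inode,
					  struct folio *folio)
	{
		const struct guest_memfd_operations *ops = inode->i_private;

		if (ops && ops->prepare_inaccessible)
			return ops->prepare_inaccessible(inode, folio);
		return 0;
	}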
Elliot Berman <quic_eberman@quicinc.com> writes:

> On Tue, Sep 10, 2024 at 11:44:01PM +0000, Ackerley Tng wrote:
>> <snip>
>
> Clearing the up-to-date flag also means that the page gets zero'd out
> whenever it transitions between shared and private (either direction).
> pKVM (Android) hypervisor policy can allow in-place conversion between
> shared/private.
>
> <snip>
>
> Fuad and I can help add some details for the conversion. Hopefully we
> can figure out some of the plan at Plumbers this week.

Zeroing the page prevents leaking host data (see the function docstring
for kvm_gmem_prepare_folio() introduced in [1]), so we definitely don't
want to introduce a kernel data leak bug here.

In-place conversion does require preservation of data, so for
conversions, shall we zero depending on VM type?

+ Gunyah: don't zero since ->prepare_inaccessible() is a no-op
+ pKVM: don't zero
+ TDX: don't zero
+ SEV: AMD Architecture Programmer's Manual 7.10.6 says there is no
  automatic encryption and implies no zeroing, hence perform zeroing
+ KVM_X86_SW_PROTECTED_VM: Doesn't have a formal definition so I guess
  we could require zeroing on transition?

This way, the uptodate flag means that the folio has been prepared (as
in sev_gmem_prepare()), and zeroed if required by VM type.

Regarding flushing the dcache/tlb in your other question [2], if we
don't use folio_zero_user(), can we rely on unmapping within core-mm
to flush after shared use, and unmapping within KVM to flush after
private use?

Or should flush_dcache_folio() be explicitly called on kvm_gmem_fault()?

clear_highpage(), used in the non-hugetlb (original) path, doesn't flush
the dcache. Was that intended?

[1] https://lore.kernel.org/all/20240726185157.72821-8-pbonzini@redhat.com/
[2] https://lore.kernel.org/all/diqz34ldszp3.fsf@ackerleytng-ctop.c.googlers.com/
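To make the policy table concrete, a sketch of a per-VM-type helper
follows. The helper name is made up, and KVM_X86_TDX_VM is assumed (it
is not an upstream VM type at the time of writing); Gunyah and pKVM are
not x86 VM types and would hook in via their own arch code:

	/*
	 * Sketch: whether guest_memfd must zero a folio on a
	 * shared<->private conversion, per the policy table above.
	 */
	static bool kvm_gmem_zero_on_conversion(struct kvm *kvm)
	{
		switch (kvm->arch.vm_type) {
		case KVM_X86_TDX_VM:	/* assumed name; TDX scrubs pages */
			return false;
		case KVM_X86_SEV_VM:
		case KVM_X86_SEV_ES_VM:
			return true;	/* APM 7.10.6: no automatic zeroing */
		case KVM_X86_SW_PROTECTED_VM:
		default:
			return true;	/* no trusted entity clears the page */
		}
	}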
Hi Ackerley,

On Thu, 2024-10-03 at 22:32 +0100, Ackerley Tng wrote:
> Elliot Berman <quic_eberman@quicinc.com> writes:
>> <snip>
>
> In-place conversion does require preservation of data, so for
> conversions, shall we zero depending on VM type?
>
> + Gunyah: don't zero since ->prepare_inaccessible() is a no-op
> + pKVM: don't zero
> + TDX: don't zero
> + SEV: AMD Architecture Programmer's Manual 7.10.6 says there is no
>   automatic encryption and implies no zeroing, hence perform zeroing
> + KVM_X86_SW_PROTECTED_VM: Doesn't have a formal definition so I guess
>   we could require zeroing on transition?

Maybe for KVM_X86_SW_PROTECTED_VM we could make zero-ing configurable
via some CREATE_GUEST_MEMFD flag, instead of forcing one specific
behavior.

For the "non-CoCo with direct map entries removed" VMs that we at AWS
are going for, we'd like a VM type with host-controlled in-place
conversions which doesn't zero on transitions, so if
KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new VM
type for that.

Somewhat related sidenote: for VMs that allow in-place conversions and
do not zero, we do not need to zap the stage-2 mappings on memory
attribute changes, right?

> <snip>

Best,
Patrick
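As a sketch, the opt-out could look roughly like the following;
GUEST_MEMFD_FLAG_NO_ZERO_ON_CONVERSION is a made-up flag name and the
helper is illustrative, not proposed uapi:

	/* uapi sketch: hypothetical flag name, not an existing KVM uapi. */
	#define GUEST_MEMFD_FLAG_NO_ZERO_ON_CONVERSION	BIT(1)

	/* @flags: the flags this guest_memfd was created with. */
	static void kvm_gmem_zero_on_conversion(struct folio *folio, u64 flags)
	{
		unsigned long i;

		if (flags & GUEST_MEMFD_FLAG_NO_ZERO_ON_CONVERSION)
			return;

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));
	}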
Patrick Roy <roypat@amazon.co.uk> writes:

> Hi Ackerley,
>
> <snip>
>
> Maybe for KVM_X86_SW_PROTECTED_VM we could make zero-ing configurable
> via some CREATE_GUEST_MEMFD flag, instead of forcing one specific
> behavior.

Sounds good to me, I can set up a flag in the next revision.

> For the "non-CoCo with direct map entries removed" VMs that we at AWS
> are going for, we'd like a VM type with host-controlled in-place
> conversions which doesn't zero on transitions, so if
> KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new VM
> type for that.
>
> Somewhat related sidenote: for VMs that allow in-place conversions and
> do not zero, we do not need to zap the stage-2 mappings on memory
> attribute changes, right?

Here are some reasons for zapping I can think of:

1. When private pages are split/merged, zapping the stage-2 mappings on
   memory attribute changes allows the private pages to be re-faulted by
   KVM at smaller/larger granularity.

2. The rationale described at
   https://elixir.bootlin.com/linux/v6.11.2/source/arch/x86/kvm/mmu/mmu.c#L7482
   ("Zapping SPTEs in this case ensures KVM will reassess whether or not
   a hugepage can be used for affected ranges.") probably refers to the
   existing implementation, where a different set of physical pages is
   used to back shared and private memory. When the same set of physical
   pages is used for both shared and private memory, then IIUC this
   rationale does not apply.

3. There's another rationale for zapping at
   https://elixir.bootlin.com/linux/v6.11.2/source/virt/kvm/kvm_main.c#L2494
   to do with read vs write mappings. I don't fully understand this;
   does this rationale still apply?

4. Is zapping required if the pages get removed from/added to the kernel
   direct map?

> <snip>
On Tue, Oct 08, 2024, Ackerley Tng wrote:
> Patrick Roy <roypat@amazon.co.uk> writes:
>> For the "non-CoCo with direct map entries removed" VMs that we at AWS
>> are going for, we'd like a VM type with host-controlled in-place
>> conversions which doesn't zero on transitions,

Hmm, your use case shouldn't need conversions _for KVM_, as there's no need for
KVM to care if userspace or the guest _wants_ a page to be shared vs. private.
Userspace is fully trusted to manage things; KVM simply reacts to the current
state of things.

And more importantly, whether or not the direct map is zapped needs to be a
property of the guest_memfd inode, i.e. can't be associated with a struct kvm.
I forget who got volunteered to do the work, but we're going to need similar
functionality for tracking the state of individual pages in a huge folio, as
folio_mark_uptodate() is too coarse-grained.  I.e. at some point, I expect that
guest_memfd will make it easy-ish to determine whether or not the direct map
has been obliterated.

The shared vs. private attributes tracking in KVM is still needed (I think), as
it communicates what userspace _wants_, whereas the guest_memfd machinery will
track what the state _is_.

>> so if KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new
>> VM type for that.

Maybe we should sneak in a s/KVM_X86_SW_PROTECTED_VM/KVM_X86_SW_HARDENED_VM
rename?  The original thought behind "software protected VM" was to do a slow
build of something akin to pKVM, but realistically I don't think that idea is
going anywhere.

Alternatively, depending on how KVM accesses guest memory that's been removed
from the direct map, another solution would be to allow "regular" VMs to bind
memslots to guest_memfd, i.e. if the non-CoCo use case needs/wants to bind all
memory to guest_memfd, not just "private" mappings.

That's probably the biggest topic of discussion: how do we want to allow
mapping guest_memfd into the guest, without direct map entries, but while still
allowing KVM to access guest memory as needed, e.g. for shadow paging.  One
approach is your RFC, where KVM maps guest_memfd pfns on-demand.

Another (slightly crazy) approach would be to use protection keys to provide
the security properties that you want, while giving KVM (and userspace) a
quick-and-easy override to access guest memory.

  1. mmap() guest_memfd into userspace with RW protections
  2. Configure PKRU to make guest_memfd memory inaccessible by default
  3. Swizzle PKRU on-demand when intentionally accessing guest memory

It's essentially the same idea as SMAP+STAC/CLAC, just applied to guest memory
instead of to userspace memory.

The benefit of the PKRU approach is that there are no PTE modifications, and
thus no TLB flushes, and only the CPU that is accessing guest memory gains
temporary access.  The big downside is that it would be limited to modern
hardware, but that might be acceptable, especially if it simplifies KVM's
implementation.

>> Somewhat related sidenote: for VMs that allow in-place conversions and
>> do not zero, we do not need to zap the stage-2 mappings on memory
>> attribute changes, right?

See above.  I don't think conversions by toggling the shared/private flag in
KVM's memory attributes is the right fit for your use case.
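For illustration, a userspace sketch of the swizzling idea using the
Linux pkey API; the mapping address and length are placeholders, and
in-kernel code would write PKRU directly rather than going through
glibc:

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <stdint.h>

	/*
	 * Sketch: tag a guest_memfd mapping with a protection key that
	 * denies access by default, and briefly enable access around an
	 * intentional read.
	 */
	static int gmem_pkey;

	static int gmem_protect(void *gmem_uva, size_t gmem_len)
	{
		gmem_pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS);
		if (gmem_pkey < 0)
			return -1;
		return pkey_mprotect(gmem_uva, gmem_len,
				     PROT_READ | PROT_WRITE, gmem_pkey);
	}

	static uint64_t gmem_read8(uint64_t *p)
	{
		uint64_t val;

		pkey_set(gmem_pkey, 0);			/* open the window */
		val = *p;
		pkey_set(gmem_pkey, PKEY_DISABLE_ACCESS); /* close it */
		return val;
	}

pkey_set() is a pair of register writes (RDPKRU/WRPKRU underneath), so
the window is opened and closed without touching PTEs, which is the
no-TLB-flush property mentioned above.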
On Tue, 2024-10-08 at 20:56 +0100, Sean Christopherson wrote:
> On Tue, Oct 08, 2024, Ackerley Tng wrote:
>> <snip>
>
> Hmm, your use case shouldn't need conversions _for KVM_, as there's no need for
> KVM to care if userspace or the guest _wants_ a page to be shared vs. private.
> Userspace is fully trusted to manage things; KVM simply reacts to the current
> state of things.
>
> And more importantly, whether or not the direct map is zapped needs to be a
> property of the guest_memfd inode, i.e. can't be associated with a struct kvm.
> I forget who got volunteered to do the work,

I think me? At least we talked about it briefly.

> but we're going to need similar
> functionality for tracking the state of individual pages in a huge folio, as
> folio_mark_uptodate() is too coarse-grained.  I.e. at some point, I expect
> that guest_memfd will make it easy-ish to determine whether or not the direct
> map has been obliterated.
>
> The shared vs. private attributes tracking in KVM is still needed (I think),
> as it communicates what userspace _wants_, whereas the guest_memfd machinery
> will track what the state _is_.

If I'm understanding this patch series correctly, the approach taken
here is to force the KVM memory attributes and the internal guest_memfd
state to be in-sync, because the VMA from mmap()ing guest_memfd is
reflected back into the userspace_addr of the memslot. So, to me, in
this world, "direct map zapped iff
kvm_has_mem_attributes(KVM_MEMORY_ATTRIBUTES_PRIVATE)", with memory
attribute changes forcing the corresponding gmem state change. That's
why I was talking about conversions above.

I've played around with this locally, and since KVM seems to generally
use copy_from_user and friends to access the userspace_addr VMA (aka
private mem that's reflected back into memslots here), with this, things
like MMIO emulation can be oblivious to gmem's existence, since
copy_from_user and co don't require GUP or presence of direct map
entries (well, "oblivious" in the sense that things like kvm_read_guest
currently ignore memory attributes and unconditionally access
userspace_addr, which I suppose is not really wanted for VMs where
userspace_addr and guest_memfd aren't short-circuited like this). The
exception is kvm_clock, where the pv_time page would need to be
explicitly converted to shared to restore the direct map entry, although
I think we could just let userspace deal with making sure this page is
shared (and then, if gmem supports GUP on shared memory, even the
gfn_to_pfn_caches could work without gmem knowledge. Without GUP, we'd
still need a tiny hack in the uhva->pfn translation somewhere to handle
gmem vmas, but iirc you did mention that having kvm-clock be special
might be fine).

I guess it does come down to what you note below, answering the question
of "how does KVM internally access guest_memfd for non-CoCo VMs". Is
there any way we can make uaccesses like the above work? I've finally
gotten around to re-running some performance benchmarks of my on-demand
reinsertion patches with all the needed TLB flushes added, and my fio
benchmark on a virtio-blk device suffers a ~50% throughput regression,
which does not necessarily spark joy. And I think James H. mentioned at
LPC that making the userfault stuff work with my patches would be quite
hard. All this in addition to you also not necessarily sounding too keen
on it either :D

>>> so if KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new
>>> VM type for that.
>
> Maybe we should sneak in a s/KVM_X86_SW_PROTECTED_VM/KVM_X86_SW_HARDENED_VM
> rename?  The original thought behind "software protected VM" was to do a slow
> build of something akin to pKVM, but realistically I don't think that idea is
> going anywhere.

Ah, admittedly I've thought of KVM_X86_SW_PROTECTED_VM as a bit of a
playground where various configurations other VM types enforce can be
mixed and matched (e.g. zero on conversions yes/no, direct map removal
yes/no), so more of a KVM_X86_GMEM_VM, but am happy to update my
understanding :)

> Another (slightly crazy) approach would be to use protection keys to provide
> the security properties that you want, while giving KVM (and userspace) a
> quick-and-easy override to access guest memory.
>
>   1. mmap() guest_memfd into userspace with RW protections
>   2. Configure PKRU to make guest_memfd memory inaccessible by default
>   3. Swizzle PKRU on-demand when intentionally accessing guest memory
>
> <snip>

Mh, but we only have 16 protection keys, so we cannot give each VM a
unique one. And if all guest memory shares the same protection key, then
during the on-demand swizzling the CPU would get access to _all_ guest
memory on the host, which "feels" scary. What do you think, @Derek?

Does ARM have something equivalent, btw?

>>> Somewhat related sidenote: for VMs that allow in-place conversions and
>>> do not zero, we do not need to zap the stage-2 mappings on memory
>>> attribute changes, right?
>
> See above.  I don't think conversions by toggling the shared/private flag in
> KVM's memory attributes is the right fit for your use case.
Patrick Roy <roypat@amazon.co.uk> writes:

> On Tue, 2024-10-08 at 20:56 +0100, Sean Christopherson wrote:
>> <snip>
>>
>> The shared vs. private attributes tracking in KVM is still needed (I think),
>> as it communicates what userspace _wants_, whereas the guest_memfd machinery
>> will track what the state _is_.
>
> If I'm understanding this patch series correctly, the approach taken
> here is to force the KVM memory attributes and the internal guest_memfd
> state to be in-sync, because the VMA from mmap()ing guest_memfd is
> reflected back into the userspace_addr of the memslot.

In this patch series, we're also using guest_memfd state (the
faultability xarray) to prevent any future faults before checking that
there are no mappings. Further explanation at [1].

Reason (a) at [1] is what Sean describes above to be what userspace
_wants_ vs what the state _is_.

> So, to me, in
> this world, "direct map zapped iff
> kvm_has_mem_attributes(KVM_MEMORY_ATTRIBUTES_PRIVATE)", with memory
> attribute changes forcing the corresponding gmem state change. That's
> why I was talking about conversions above.

I think if we do continue to have state in guest_memfd, then direct map
removal should be based on guest_memfd's state, rather than
KVM_MEMORY_ATTRIBUTE_PRIVATE in mem_attr_array.

> <snip>
>
> Ah, admittedly I've thought of KVM_X86_SW_PROTECTED_VM as a bit of a
> playground where various configurations other VM types enforce can be
> mixed and matched (e.g. zero on conversions yes/no, direct map removal
> yes/no), so more of a KVM_X86_GMEM_VM, but am happy to update my
> understanding :)

Given the different axes of possible configurations for guest_memfd
(zero on conversion, direct map removal), I think it's better to let
userspace choose, than to enumerate the combinations in VM types.

Independently of whether to use a flag or VM type to configure
guest_memfd, the "zeroed" state has to be stored somewhere.

For folios to at least be zeroed once, presence in the filemap could
indicate "zeroed".

Presence in the filemap may be awkward to use as an indication of
"zeroed" for the conversion case.

What else can we use to store "zeroed"? Suggestions:

1. Since "prepared" already took the dirty bit on the folio, "zeroed"
   can use the checked bit on the folio. [2] indicates that it is for
   filesystems, which sounds like guest_memfd :)
2. folio->private (which we may already need to use)
3. Another xarray

[1] https://lore.kernel.org/all/diqz1q0qtqnd.fsf@ackerleytng-ctop.c.googlers.com/T/#ma6f828d7a50c4de8a2f829a16c9bb458b53d8f3f
[2] https://elixir.bootlin.com/linux/v6.11.4/source/include/linux/page-flags.h#L147
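A sketch of option 1, assuming made-up kvm_gmem_* helper names wrapped
around the real folio checked-bit accessors:

	/*
	 * Sketch: track "zeroed" with PG_checked, leaving PG_uptodate
	 * to mean "prepared".
	 */
	static inline bool kvm_gmem_folio_zeroed(struct folio *folio)
	{
		return folio_test_checked(folio);
	}

	static inline void kvm_gmem_mark_zeroed(struct folio *folio)
	{
		folio_set_checked(folio);
	}

	static void kvm_gmem_zero_folio_once(struct folio *folio)
	{
		unsigned long i;

		if (kvm_gmem_folio_zeroed(folio))
			return;

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));

		kvm_gmem_mark_zeroed(folio);
	}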
On Fri, 2024-10-18 at 00:16 +0100, Ackerley Tng wrote:
> Patrick Roy <roypat@amazon.co.uk> writes:
>
>> If I'm understanding this patch series correctly, the approach taken
>> here is to force the KVM memory attributes and the internal guest_memfd
>> state to be in-sync, because the VMA from mmap()ing guest_memfd is
>> reflected back into the userspace_addr of the memslot.
>
> In this patch series, we're also using guest_memfd state (the
> faultability xarray) to prevent any future faults before checking that
> there are no mappings. Further explanation at [1].
>
> Reason (a) at [1] is what Sean describes above to be what userspace
> _wants_ vs what the state _is_.

Ah, I was missing that detail about faultability being disabled, yet
mem_attrs not being updated until all pins are actually gone. Thanks!

Mh, I'm probably not seeing it because of my lack of experience with
CoCo setups, but how would pKVM not trusting userspace about conversions
cause mem_attrs and faultability to go out of sync? Or generally, if the
guest and userspace have different ideas about what is shared, and
userspace's idea is stored in mem_attrs (or rather, the part where they
can agree is stored in mem_attrs?), where do we store the guest's view
of it? Guest page tables?

>> So, to me, in
>> this world, "direct map zapped iff
>> kvm_has_mem_attributes(KVM_MEMORY_ATTRIBUTES_PRIVATE)", with memory
>> attribute changes forcing the corresponding gmem state change. That's
>> why I was talking about conversions above.
>
> I think if we do continue to have state in guest_memfd, then direct map
> removal should be based on guest_memfd's state, rather than
> KVM_MEMORY_ATTRIBUTE_PRIVATE in mem_attr_array.

I am not trying to argue against tracking it in guest_memfd, I'm just
wondering if mem attributes and direct map state would ever disagree.
But probably that's also just because of my confusion above :)

> <snip>
>
> [1] https://lore.kernel.org/all/diqz1q0qtqnd.fsf@ackerleytng-ctop.c.googlers.com/T/#ma6f828d7a50c4de8a2f829a16c9bb458b53d8f3f
On 2024-10-10 at 16:21+0000 Patrick Roy wrote:
> On Tue, 2024-10-08 at 20:56 +0100, Sean Christopherson wrote:
>> Another (slightly crazy) approach would be to use protection keys to provide
>> the security properties that you want, while giving KVM (and userspace) a
>> quick-and-easy override to access guest memory.
>>
>> <snip>
>
> Mh, but we only have 16 protection keys, so we cannot give each VM a
> unique one. And if all guest memory shares the same protection key, then
> during the on-demand swizzling the CPU would get access to _all_ guest
> memory on the host, which "feels" scary. What do you think, @Derek?

Yes, I am concerned about this. I don't see a way to use protection keys
that would ensure the host kernel cannot be tricked by one guest into
speculatively accessing another guest's memory (unless we do a key per
VM, which like you say severely limits how many guests you can host).

> Does ARM have something equivalent, btw?

Yes - the Permission Overlay Extension [1]. Although even the most
recent parts don't offer it. I don't see it in Neoverse V3 or Cortex-X4.

Derek

[1] https://lore.kernel.org/all/20240822151113.1479789-1-joey.gouly@arm.com/
On 2024-10-08 at 19:56+0000 Sean Christopherson wrote:
> Another (slightly crazy) approach would be to use protection keys to provide
> the security properties that you want, while giving KVM (and userspace) a
> quick-and-easy override to access guest memory.
>
>   1. mmap() guest_memfd into userspace with RW protections
>   2. Configure PKRU to make guest_memfd memory inaccessible by default
>   3. Swizzle PKRU on-demand when intentionally accessing guest memory
>
> <snip>

Yeah, this might be worth it if it simplifies significantly. Jenkins et
al. showed MPK worked for stopping in-process Spectre V1 [1]. While
future hardware bugs are always possible, the host kernel would still
offer better protection overall, since discovery of additional Spectre
approaches and gadgets in the kernel is more likely (I think it's a
bigger surface area than hardware-specific MPK transient execution
issues).

Patrick, we talked about this a couple weeks ago and ended up focusing
on within-userspace protection, but I see keys can also be used to stop
kernel access, like the project Andrew mentioned during Dave's MPK
session at LPC [2]. Andrew, could you share that here?

It's not clear to me how reliably the kernel prevents its own access to
such pages. I see a few papers that warrant more investigation:

"we found multiple interfaces that Linux, by design, provides for
accessing process memory that ignore PKU domains on a page." [3]

"Though Connor et al. demonstrate that existing MPK protections can be
bypassed by using the kernel as a confused deputy, compelling recent
work indicates that MPK operations can be made secure." [4]

Dave and others, if you're aware of resources clarifying how strong the
boundaries are, that would be helpful.

Derek

[1] https://www.cs.dartmouth.edu/~sws/pubs/jas2020.pdf
[2] https://www.youtube.com/watch?v=gEUeMfrNH94&t=1028s
[3] https://www.usenix.org/system/files/sec20-connor.pdf
[4] https://ics.uci.edu/~dabrowsa/kirth-eurosys22-pkru.pdf
On 09/10/2024 4:51 am, Manwaring, Derek wrote:
> On 2024-10-08 at 19:56+0000 Sean Christopherson wrote:
>> <snip>
>
> Patrick, we talked about this a couple weeks ago and ended up focusing
> on within-userspace protection, but I see keys can also be used to stop
> kernel access, like the project Andrew mentioned during Dave's MPK
> session at LPC [2]. Andrew, could you share that here?

This was in reference to PKS specifically (so Sapphire Rapids and
later), and also for Xen, but the technique is general.

Allocate one supervisor key for the directmap (and other ranges wanting
protecting), and configure MSR_PKS[key]=AD by default.

Protection Keys were identified as being safe as a defence against
Meltdown. At the time, only PKRU existed, and PKS was expected to have
been less overhead than KPTI on Skylake, which was even more frustrating
for those of us who'd begged for a supervisor form at the time. What's
done is done.

The changes needed in main code would be accessors for directmap
pointers, because there needs to be a temporary AD-disable. This would
take the form of 2x WRMSR, as opposed to a STAC/CLAC pair.

An area of concern is the overhead of the WRMSRs. MSR_PKS is defined as
not-architecturally-serialising, but like STAC/CLAC probably comes with
model-dependent dispatch-serialising properties to prevent memory
accesses executing speculatively under the wrong protection key.

Also, for this strategy to be effective, you need to PKEY-tag all
aliases of the memory.

~Andrew
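A rough sketch of what such accessors might look like; MSR_IA32_PKRS is
the real MSR, while the key assignment, the PKRS_DEFAULT value, and the
helper names are assumptions (real code would also have to manage
preemption and per-CPU MSR state):

	/*
	 * Sketch: temporarily lift the PKS access-disable (AD) bit for
	 * the key covering the directmap, around an intentional access.
	 * Each key owns two bits in PKRS; AD is bit 2*key.
	 */
	#define PKS_KEY_DIRECTMAP	1
	#define PKS_AD(key)		(1u << ((key) * 2))
	#define PKRS_DEFAULT		PKS_AD(PKS_KEY_DIRECTMAP)

	static inline void directmap_access_begin(void)
	{
		wrmsrl(MSR_IA32_PKRS,
		       PKRS_DEFAULT & ~PKS_AD(PKS_KEY_DIRECTMAP));
	}

	static inline void directmap_access_end(void)
	{
		wrmsrl(MSR_IA32_PKRS, PKRS_DEFAULT);
	}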
Ackerley Tng <ackerleytng@google.com> writes:

> Elliot Berman <quic_eberman@quicinc.com> writes:
>> <snip>
>
> Zeroing the page prevents leaking host data (see the function docstring
> for kvm_gmem_prepare_folio() introduced in [1]), so we definitely don't
> want to introduce a kernel data leak bug here.

Actually it seems like filemap_grab_folio() already gets a zeroed page.

filemap_grab_folio() eventually calls __alloc_pages_noprof()
-> get_page_from_freelist()
-> prep_new_page()
-> post_alloc_hook()

and post_alloc_hook() calls kernel_init_pages(), which zeroes the page,
depending on kernel config.

Paolo, was calling clear_highpage() in kvm_gmem_prepare_folio() zeroing
an already empty page returned from filemap_grab_folio()?

> <snip>
>
> [1] https://lore.kernel.org/all/20240726185157.72821-8-pbonzini@redhat.com/
> [2] https://lore.kernel.org/all/diqz34ldszp3.fsf@ackerleytng-ctop.c.googlers.com/
On Thu, Oct 03, 2024, Ackerley Tng wrote:
> Ackerley Tng <ackerleytng@google.com> writes:
>> Zeroing the page prevents leaking host data (see the function docstring
>> for kvm_gmem_prepare_folio() introduced in [1]), so we definitely don't
>> want to introduce a kernel data leak bug here.
>
> Actually it seems like filemap_grab_folio() already gets a zeroed page.
>
> filemap_grab_folio() eventually calls __alloc_pages_noprof()
> -> get_page_from_freelist()
> -> prep_new_page()
> -> post_alloc_hook()
>
> and post_alloc_hook() calls kernel_init_pages(), which zeroes the page,
> depending on kernel config.
>
> Paolo, was calling clear_highpage() in kvm_gmem_prepare_folio() zeroing
> an already empty page returned from filemap_grab_folio()?

Yes and no.  CONFIG_INIT_ON_ALLOC_DEFAULT_ON and init_on_alloc are very much
hardening features, not functional behavior that other code _needs_ to be
aware of.  E.g. enabling init-on-alloc comes with a measurable performance
cost.

Ignoring hardening, the guest_memfd mapping specifically sets the gfp_mask to
GFP_HIGHUSER, i.e. doesn't set __GFP_ZERO.

That said, I wouldn't be opposed to skipping the clear_highpage() call when
want_init_on_alloc() is true.

Also, the intended behavior (or at least, what I intended) of
kvm_gmem_prepare_folio() was that it would do clear_highpage() if and only if
a trusted entity does NOT zero the page.  Factoring that in is a bit harder,
as it probably requires another arch hook (or providing an out-param from
kvm_arch_gmem_prepare()).  I.e. the want_init_on_alloc() case isn't the only
time KVM could shave cycles by not redundantly zeroing memory.
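A sketch of the skip suggested above; want_init_on_alloc() is the real
helper, while the surrounding function is illustrative:

	/*
	 * Sketch: skip the redundant clear_highpage() when init-on-alloc
	 * already zeroed the freshly allocated folio. GFP_HIGHUSER matches
	 * the mask guest_memfd sets for its mapping.
	 */
	static void kvm_gmem_zero_folio(struct folio *folio)
	{
		unsigned long i;

		if (want_init_on_alloc(GFP_HIGHUSER))
			return;	/* page allocator already zeroed the pages */

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));
	}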