Since guest_memfd now supports mmap(), folios have to be prepared
before they are faulted into userspace.

When memory attributes are switched between shared and private, the
folio's up-to-date flag is cleared.

Use the folio's up-to-date flag to indicate that the folio is ready for
guest usage; the flag marks readiness for either shared or private use.
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
---
 virt/kvm/guest_memfd.c | 131 ++++++++++++++++++++++++++++++++++++++++-
 virt/kvm/kvm_main.c    |   2 +
 virt/kvm/kvm_mm.h      |   7 +++
 3 files changed, 139 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 110c4bbb004b..fb292e542381 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -129,13 +129,29 @@ static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slo
}
/**
- * Use the uptodate flag to indicate that the folio is prepared for KVM's usage.
+ * Use folio's up-to-date flag to indicate that this folio is prepared for usage
+ * by the guest.
+ *
+ * This flag can be used whether the folio is prepared for PRIVATE or SHARED
+ * usage.
*/
static inline void kvm_gmem_mark_prepared(struct folio *folio)
{
folio_mark_uptodate(folio);
}
+/**
+ * Use folio's up-to-date flag to indicate that this folio is not yet prepared for
+ * usage by the guest.
+ *
+ * This flag can be used whether the folio is prepared for PRIVATE or SHARED
+ * usage.
+ */
+static inline void kvm_gmem_clear_prepared(struct folio *folio)
+{
+ folio_clear_uptodate(folio);
+}
+
/*
* Process @folio, which contains @gfn, so that the guest can use it.
* The folio must be locked and the gfn must be contained in @slot.
@@ -148,6 +164,12 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index;
int r;
+ /*
+ * Defensively zero folio to avoid leaking kernel memory in
+ * uninitialized pages. This is important since pages can now be mapped
+ * into userspace, where hardware (e.g. TDX) won't be clearing those
+ * pages.
+ */
if (folio_test_hugetlb(folio)) {
folio_zero_user(folio, folio->index << PAGE_SHIFT);
} else {
@@ -1017,6 +1039,7 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
{
struct inode *inode;
struct folio *folio;
+ bool is_prepared;
inode = file_inode(vmf->vma->vm_file);
if (!kvm_gmem_is_faultable(inode, vmf->pgoff))
@@ -1026,6 +1049,31 @@ static vm_fault_t kvm_gmem_fault(struct vm_fault *vmf)
if (!folio)
return VM_FAULT_SIGBUS;
+ is_prepared = folio_test_uptodate(folio);
+ if (!is_prepared) {
+ unsigned long nr_pages;
+ unsigned long i;
+
+ if (folio_test_hugetlb(folio)) {
+ folio_zero_user(folio, folio->index << PAGE_SHIFT);
+ } else {
+ /*
+ * Defensively zero folio to avoid leaking kernel memory in
+ * uninitialized pages. This is important since pages can now be
+ * mapped into userspace, where hardware (e.g. TDX) won't be
+ * clearing those pages.
+ *
+ * Will probably need a version of kvm_gmem_prepare_folio() to
+ * prepare the page for SHARED use.
+ */
+ nr_pages = folio_nr_pages(folio);
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+ }
+
+ kvm_gmem_mark_prepared(folio);
+ }
+
vmf->page = folio_file_page(folio, vmf->pgoff);
return VM_FAULT_LOCKED;
}
@@ -1593,6 +1641,87 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
}
EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+static void kvm_gmem_clear_prepared_range(struct inode *inode, pgoff_t start,
+ pgoff_t end)
+{
+ pgoff_t index;
+
+ filemap_invalidate_lock_shared(inode->i_mapping);
+
+ /* TODO: replace iteration with filemap_get_folios() for efficiency. */
+ for (index = start; index < end;) {
+ struct folio *folio;
+
+ /* Don't use kvm_gmem_get_folio to avoid allocating */
+ folio = filemap_lock_folio(inode->i_mapping, index);
+ if (IS_ERR(folio)) {
+ ++index;
+ continue;
+ }
+
+ kvm_gmem_clear_prepared(folio);
+
+ index = folio_next_index(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ filemap_invalidate_unlock_shared(inode->i_mapping);
+}
+
+/**
+ * Clear the prepared flag for all folios in gfn range [@start, @end) in memslot
+ * @slot.
+ */
+static void kvm_gmem_clear_prepared_slot(struct kvm_memory_slot *slot, gfn_t start,
+ gfn_t end)
+{
+ pgoff_t start_offset;
+ pgoff_t end_offset;
+ struct file *file;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return;
+
+ start_offset = start - slot->base_gfn + slot->gmem.pgoff;
+ end_offset = end - slot->base_gfn + slot->gmem.pgoff;
+
+ kvm_gmem_clear_prepared_range(file_inode(file), start_offset, end_offset);
+
+ fput(file);
+}
+
+/**
+ * Clear the prepared flag for all folios for any slot in gfn range
+ * [@start, @end) in @kvm.
+ */
+void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+ int i;
+
+ lockdep_assert_held(&kvm->slots_lock);
+
+ for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
+ struct kvm_memslot_iter iter;
+ struct kvm_memslots *slots;
+
+ slots = __kvm_memslots(kvm, i);
+ kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+ struct kvm_memory_slot *slot;
+ gfn_t gfn_start;
+ gfn_t gfn_end;
+
+ slot = iter.slot;
+ gfn_start = max(start, slot->base_gfn);
+ gfn_end = min(end, slot->base_gfn + slot->npages);
+
+ if (iter.slot->flags & KVM_MEM_GUEST_MEMFD)
+ kvm_gmem_clear_prepared_slot(iter.slot, gfn_start, gfn_end);
+ }
+ }
+}
+
/**
* Returns true if pages in range [@start, @end) in inode @inode have no
* userspace mappings.
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1a7bbcc31b7e..255d27df7f5c 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2565,6 +2565,8 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
KVM_BUG_ON(r, kvm);
}
+ kvm_gmem_clear_prepared_vm(kvm, start, end);
+
kvm_handle_gfn_range(kvm, &post_set_range);
out_unlock:
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index d8ff2b380d0e..25fd0d9f66cc 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -43,6 +43,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
void kvm_gmem_unbind(struct kvm_memory_slot *slot);
int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
unsigned long attrs);
+void kvm_gmem_clear_prepared_vm(struct kvm *kvm, gfn_t start, gfn_t end);
#else
static inline void kvm_gmem_init(struct module *module)
{
@@ -68,6 +69,12 @@ static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
return 0;
}
+static inline void kvm_gmem_clear_prepared_vm(struct kvm *kvm,
+ gfn_t start, gfn_t end)
+{
+ WARN_ON_ONCE(1);
+}
+
#endif /* CONFIG_KVM_PRIVATE_MEM */
#endif /* __KVM_MM_H__ */
--
2.46.0.598.g6f2099f65c-goog
On Tue, Sep 10, 2024 at 11:44:01PM +0000, Ackerley Tng wrote:
> Since guest_memfd now supports mmap(), folios have to be prepared
> before they are faulted into userspace.
>
> When memory attributes are switched between shared and private, the
> folio's up-to-date flag is cleared.
>
> Use the folio's up-to-date flag to indicate that the folio is ready for
> guest usage; the flag marks readiness for either shared or private use.

Clearing the up-to-date flag also means that the page gets zero'd out
whenever it transitions between shared and private (either direction).
pKVM (Android) hypervisor policy can allow in-place conversion between
shared/private.

I believe the important thing is that sev_gmem_prepare() needs to be
called prior to giving the page to the guest. In my series, I had made a
->prepare_inaccessible() callback where KVM would only do this part.
When transitioning to inaccessible, only that callback would be made,
besides the bookkeeping. The folio zeroing happens once when allocating
the folio, if the folio is initially accessible (faultable).

From the x86 CoCo perspective, I think it also makes sense to not zero
the folio when changing faultability from private to shared:
- If the guest is sharing some data with the host, you've wiped the data
  and the guest has to copy it again.
- Or, if SEV/TDX enforces that the page is zero'd between transitions,
  Linux has duplicated the work that the trusted entity has already done.

Fuad and I can help add some details for the conversion. Hopefully we
can figure out some of the plan at Plumbers this week.

Thanks,
Elliot

> <snip>
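A minimal sketch of the callback split Elliot describes; the
guest_memfd_operations struct, the gmem_make_inaccessible() call site,
and stashing the ops pointer in inode->i_private are illustrative
assumptions, not code from his actual series:

	/*
	 * Sketch only: on a shared->private transition, run just the arch
	 * "make inaccessible" step; zeroing happened once at allocation
	 * time, when the folio was first made accessible (faultable).
	 */
	struct guest_memfd_operations {
		/* e.g. sev_gmem_prepare() for SNP, or a pKVM unshare hook */
		int (*prepare_inaccessible)(struct inode *inode,
					    struct folio *folio);
	};

	static int gmem_make_inaccessible(struct inode *inode,
					  struct folio *folio)
	{
		const struct guest_memfd_operations *ops = inode->i_private;

		if (ops && ops->prepare_inaccessible)
			return ops->prepare_inaccessible(inode, folio);
		return 0;
	}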
Elliot Berman <quic_eberman@quicinc.com> writes:

> On Tue, Sep 10, 2024 at 11:44:01PM +0000, Ackerley Tng wrote:
>> <snip>
>
> Clearing the up-to-date flag also means that the page gets zero'd out
> whenever it transitions between shared and private (either direction).
> pKVM (Android) hypervisor policy can allow in-place conversion between
> shared/private.
>
> <snip>
>
> Fuad and I can help add some details for the conversion. Hopefully we
> can figure out some of the plan at Plumbers this week.

Zeroing the page prevents leaking host data (see the function docstring
for kvm_gmem_prepare_folio() introduced in [1]), so we definitely don't
want to introduce a kernel data leak bug here.

In-place conversion does require preservation of data, so for
conversions, shall we zero depending on VM type?

+ Gunyah: don't zero since ->prepare_inaccessible() is a no-op
+ pKVM: don't zero
+ TDX: don't zero
+ SEV: AMD Architecture Programmer's Manual 7.10.6 says there is no
  automatic encryption and implies no zeroing, hence perform zeroing
+ KVM_X86_SW_PROTECTED_VM: Doesn't have a formal definition so I guess
  we could require zeroing on transition?

This way, the uptodate flag means that the folio has been prepared (as
in sev_gmem_prepare()), and zeroed if required by VM type.

Regarding flushing the dcache/tlb in your other question [2], if we
don't use folio_zero_user(), can we rely on unmapping within core-mm
to flush after shared use, and unmapping within KVM to flush after
private use?

Or should flush_dcache_folio() be explicitly called on kvm_gmem_fault()?

clear_highpage(), used in the non-hugetlb (original) path, doesn't flush
the dcache. Was that intended?

[1] https://lore.kernel.org/all/20240726185157.72821-8-pbonzini@redhat.com/
[2] https://lore.kernel.org/all/diqz34ldszp3.fsf@ackerleytng-ctop.c.googlers.com/
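To make the policy table concrete, a sketch of a per-VM-type helper
follows. The helper name is made up, and KVM_X86_TDX_VM is assumed (it
is not an upstream VM type at the time of writing); Gunyah and pKVM are
not x86 VM types and would hook in via their own arch code:

	/*
	 * Sketch: whether guest_memfd must zero a folio on a
	 * shared<->private conversion, per the policy table above.
	 */
	static bool kvm_gmem_zero_on_conversion(struct kvm *kvm)
	{
		switch (kvm->arch.vm_type) {
		case KVM_X86_TDX_VM:	/* assumed name; TDX scrubs pages */
			return false;
		case KVM_X86_SEV_VM:
		case KVM_X86_SEV_ES_VM:
			return true;	/* APM 7.10.6: no automatic zeroing */
		case KVM_X86_SW_PROTECTED_VM:
		default:
			return true;	/* no trusted entity clears the page */
		}
	}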
Hi Ackerley,

On Thu, 2024-10-03 at 22:32 +0100, Ackerley Tng wrote:
> Elliot Berman <quic_eberman@quicinc.com> writes:
>> <snip>
>
> In-place conversion does require preservation of data, so for
> conversions, shall we zero depending on VM type?
>
> + Gunyah: don't zero since ->prepare_inaccessible() is a no-op
> + pKVM: don't zero
> + TDX: don't zero
> + SEV: AMD Architecture Programmer's Manual 7.10.6 says there is no
>   automatic encryption and implies no zeroing, hence perform zeroing
> + KVM_X86_SW_PROTECTED_VM: Doesn't have a formal definition so I guess
>   we could require zeroing on transition?

Maybe for KVM_X86_SW_PROTECTED_VM we could make zero-ing configurable
via some CREATE_GUEST_MEMFD flag, instead of forcing one specific
behavior.

For the "non-CoCo with direct map entries removed" VMs that we at AWS
are going for, we'd like a VM type with host-controlled in-place
conversions which doesn't zero on transitions, so if
KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new VM
type for that.

Somewhat related sidenote: for VMs that allow in-place conversions and
do not zero, we do not need to zap the stage-2 mappings on memory
attribute changes, right?

> <snip>

Best,
Patrick
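As a sketch, the opt-out could look roughly like the following;
GUEST_MEMFD_FLAG_NO_ZERO_ON_CONVERSION is a made-up flag name and the
helper is illustrative, not proposed uapi:

	/* uapi sketch: hypothetical flag name, not an existing KVM uapi. */
	#define GUEST_MEMFD_FLAG_NO_ZERO_ON_CONVERSION	BIT(1)

	/* @flags: the flags this guest_memfd was created with. */
	static void kvm_gmem_zero_on_conversion(struct folio *folio, u64 flags)
	{
		unsigned long i;

		if (flags & GUEST_MEMFD_FLAG_NO_ZERO_ON_CONVERSION)
			return;

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));
	}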
Patrick Roy <roypat@amazon.co.uk> writes:

> Hi Ackerley,
>
> <snip>
>
> Maybe for KVM_X86_SW_PROTECTED_VM we could make zero-ing configurable
> via some CREATE_GUEST_MEMFD flag, instead of forcing one specific
> behavior.

Sounds good to me, I can set up a flag in the next revision.

> For the "non-CoCo with direct map entries removed" VMs that we at AWS
> are going for, we'd like a VM type with host-controlled in-place
> conversions which doesn't zero on transitions, so if
> KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new VM
> type for that.
>
> Somewhat related sidenote: for VMs that allow in-place conversions and
> do not zero, we do not need to zap the stage-2 mappings on memory
> attribute changes, right?

Here are some reasons for zapping I can think of:

1. When private pages are split/merged, zapping the stage-2 mappings on
   memory attribute changes allows the private pages to be re-faulted by
   KVM at smaller/larger granularity.

2. The rationale described at
   https://elixir.bootlin.com/linux/v6.11.2/source/arch/x86/kvm/mmu/mmu.c#L7482
   ("Zapping SPTEs in this case ensures KVM will reassess whether or not
   a hugepage can be used for affected ranges.") probably refers to the
   existing implementation, where a different set of physical pages is
   used to back shared and private memory. When the same set of physical
   pages is used for both shared and private memory, then IIUC this
   rationale does not apply.

3. There's another rationale for zapping at
   https://elixir.bootlin.com/linux/v6.11.2/source/virt/kvm/kvm_main.c#L2494
   to do with read vs write mappings. I don't fully understand this;
   does this rationale still apply?

4. Is zapping required if the pages get removed from/added to the kernel
   direct map?

> <snip>
On Tue, Oct 08, 2024, Ackerley Tng wrote:
> Patrick Roy <roypat@amazon.co.uk> writes:
>> For the "non-CoCo with direct map entries removed" VMs that we at AWS
>> are going for, we'd like a VM type with host-controlled in-place
>> conversions which doesn't zero on transitions,

Hmm, your use case shouldn't need conversions _for KVM_, as there's no need for
KVM to care if userspace or the guest _wants_ a page to be shared vs. private.
Userspace is fully trusted to manage things; KVM simply reacts to the current
state of things.

And more importantly, whether or not the direct map is zapped needs to be a
property of the guest_memfd inode, i.e. can't be associated with a struct kvm.
I forget who got volunteered to do the work, but we're going to need similar
functionality for tracking the state of individual pages in a huge folio, as
folio_mark_uptodate() is too coarse-grained.  I.e. at some point, I expect that
guest_memfd will make it easy-ish to determine whether or not the direct map
has been obliterated.

The shared vs. private attributes tracking in KVM is still needed (I think), as
it communicates what userspace _wants_, whereas the guest_memfd machinery will
track what the state _is_.

>> so if KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new
>> VM type for that.

Maybe we should sneak in a s/KVM_X86_SW_PROTECTED_VM/KVM_X86_SW_HARDENED_VM
rename?  The original thought behind "software protected VM" was to do a slow
build of something akin to pKVM, but realistically I don't think that idea is
going anywhere.

Alternatively, depending on how KVM accesses guest memory that's been removed
from the direct map, another solution would be to allow "regular" VMs to bind
memslots to guest_memfd, i.e. if the non-CoCo use case needs/wants to bind all
memory to guest_memfd, not just "private" mappings.

That's probably the biggest topic of discussion: how do we want to allow
mapping guest_memfd into the guest, without direct map entries, but while still
allowing KVM to access guest memory as needed, e.g. for shadow paging.  One
approach is your RFC, where KVM maps guest_memfd pfns on-demand.

Another (slightly crazy) approach would be to use protection keys to provide
the security properties that you want, while giving KVM (and userspace) a
quick-and-easy override to access guest memory.

  1. mmap() guest_memfd into userspace with RW protections
  2. Configure PKRU to make guest_memfd memory inaccessible by default
  3. Swizzle PKRU on-demand when intentionally accessing guest memory

It's essentially the same idea as SMAP+STAC/CLAC, just applied to guest memory
instead of to userspace memory.

The benefit of the PKRU approach is that there are no PTE modifications, and
thus no TLB flushes, and only the CPU that is accessing guest memory gains
temporary access.  The big downside is that it would be limited to modern
hardware, but that might be acceptable, especially if it simplifies KVM's
implementation.

>> Somewhat related sidenote: for VMs that allow in-place conversions and
>> do not zero, we do not need to zap the stage-2 mappings on memory
>> attribute changes, right?

See above.  I don't think conversions by toggling the shared/private flag in
KVM's memory attributes is the right fit for your use case.
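For illustration, a userspace sketch of the swizzling idea using the
Linux pkey API; the mapping address and length are placeholders, and
in-kernel code would write PKRU directly rather than going through
glibc:

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <stdint.h>

	/*
	 * Sketch: tag a guest_memfd mapping with a protection key that
	 * denies access by default, and briefly enable access around an
	 * intentional read.
	 */
	static int gmem_pkey;

	static int gmem_protect(void *gmem_uva, size_t gmem_len)
	{
		gmem_pkey = pkey_alloc(0, PKEY_DISABLE_ACCESS);
		if (gmem_pkey < 0)
			return -1;
		return pkey_mprotect(gmem_uva, gmem_len,
				     PROT_READ | PROT_WRITE, gmem_pkey);
	}

	static uint64_t gmem_read8(uint64_t *p)
	{
		uint64_t val;

		pkey_set(gmem_pkey, 0);			/* open the window */
		val = *p;
		pkey_set(gmem_pkey, PKEY_DISABLE_ACCESS); /* close it */
		return val;
	}

pkey_set() is a pair of register writes (RDPKRU/WRPKRU underneath), so
the window is opened and closed without touching PTEs, which is the
no-TLB-flush property mentioned above.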
On Tue, 2024-10-08 at 20:56 +0100, Sean Christopherson wrote:
> On Tue, Oct 08, 2024, Ackerley Tng wrote:
>> <snip>
>
> Hmm, your use case shouldn't need conversions _for KVM_, as there's no need for
> KVM to care if userspace or the guest _wants_ a page to be shared vs. private.
> Userspace is fully trusted to manage things; KVM simply reacts to the current
> state of things.
>
> And more importantly, whether or not the direct map is zapped needs to be a
> property of the guest_memfd inode, i.e. can't be associated with a struct kvm.
> I forget who got volunteered to do the work,

I think me? At least we talked about it briefly.

> but we're going to need similar
> functionality for tracking the state of individual pages in a huge folio, as
> folio_mark_uptodate() is too coarse-grained.  I.e. at some point, I expect
> that guest_memfd will make it easy-ish to determine whether or not the direct
> map has been obliterated.
>
> The shared vs. private attributes tracking in KVM is still needed (I think),
> as it communicates what userspace _wants_, whereas the guest_memfd machinery
> will track what the state _is_.

If I'm understanding this patch series correctly, the approach taken
here is to force the KVM memory attributes and the internal guest_memfd
state to be in-sync, because the VMA from mmap()ing guest_memfd is
reflected back into the userspace_addr of the memslot. So, to me, in
this world, "direct map zapped iff
kvm_has_mem_attributes(KVM_MEMORY_ATTRIBUTES_PRIVATE)", with memory
attribute changes forcing the corresponding gmem state change. That's
why I was talking about conversions above.

I've played around with this locally, and since KVM seems to generally
use copy_from_user and friends to access the userspace_addr VMA (aka
private mem that's reflected back into memslots here), with this, things
like MMIO emulation can be oblivious to gmem's existence, since
copy_from_user and co don't require GUP or presence of direct map
entries (well, "oblivious" in the sense that things like kvm_read_guest
currently ignore memory attributes and unconditionally access
userspace_addr, which I suppose is not really wanted for VMs where
userspace_addr and guest_memfd aren't short-circuited like this). The
exception is kvm_clock, where the pv_time page would need to be
explicitly converted to shared to restore the direct map entry, although
I think we could just let userspace deal with making sure this page is
shared (and then, if gmem supports GUP on shared memory, even the
gfn_to_pfn_caches could work without gmem knowledge. Without GUP, we'd
still need a tiny hack in the uhva->pfn translation somewhere to handle
gmem vmas, but iirc you did mention that having kvm-clock be special
might be fine).

I guess it does come down to what you note below, answering the question
of "how does KVM internally access guest_memfd for non-CoCo VMs". Is
there any way we can make uaccesses like the above work? I've finally
gotten around to re-running some performance benchmarks of my on-demand
reinsertion patches with all the needed TLB flushes added, and my fio
benchmark on a virtio-blk device suffers a ~50% throughput regression,
which does not necessarily spark joy. And I think James H. mentioned at
LPC that making the userfault stuff work with my patches would be quite
hard. All this in addition to you also not necessarily sounding too keen
on it either :D

>>> so if KVM_X86_SW_PROTECTED_VM ends up zeroing, we'd need to add another new
>>> VM type for that.
>
> Maybe we should sneak in a s/KVM_X86_SW_PROTECTED_VM/KVM_X86_SW_HARDENED_VM
> rename?  The original thought behind "software protected VM" was to do a slow
> build of something akin to pKVM, but realistically I don't think that idea is
> going anywhere.

Ah, admittedly I've thought of KVM_X86_SW_PROTECTED_VM as a bit of a
playground where various configurations other VM types enforce can be
mixed and matched (e.g. zero on conversions yes/no, direct map removal
yes/no), so more of a KVM_X86_GMEM_VM, but am happy to update my
understanding :)

> Another (slightly crazy) approach would be to use protection keys to provide
> the security properties that you want, while giving KVM (and userspace) a
> quick-and-easy override to access guest memory.
>
>   1. mmap() guest_memfd into userspace with RW protections
>   2. Configure PKRU to make guest_memfd memory inaccessible by default
>   3. Swizzle PKRU on-demand when intentionally accessing guest memory
>
> <snip>

Mh, but we only have 16 protection keys, so we cannot give each VM a
unique one. And if all guest memory shares the same protection key, then
during the on-demand swizzling the CPU would get access to _all_ guest
memory on the host, which "feels" scary. What do you think, @Derek?

Does ARM have something equivalent, btw?

>>> Somewhat related sidenote: for VMs that allow in-place conversions and
>>> do not zero, we do not need to zap the stage-2 mappings on memory
>>> attribute changes, right?
>
> See above.  I don't think conversions by toggling the shared/private flag in
> KVM's memory attributes is the right fit for your use case.
Patrick Roy <roypat@amazon.co.uk> writes:

> On Tue, 2024-10-08 at 20:56 +0100, Sean Christopherson wrote:
>> <snip>
>>
>> The shared vs. private attributes tracking in KVM is still needed (I think),
>> as it communicates what userspace _wants_, whereas the guest_memfd machinery
>> will track what the state _is_.
>
> If I'm understanding this patch series correctly, the approach taken
> here is to force the KVM memory attributes and the internal guest_memfd
> state to be in-sync, because the VMA from mmap()ing guest_memfd is
> reflected back into the userspace_addr of the memslot.

In this patch series, we're also using guest_memfd state (the
faultability xarray) to prevent any future faults before checking that
there are no mappings. Further explanation at [1].

Reason (a) at [1] is what Sean describes above to be what userspace
_wants_ vs what the state _is_.

> So, to me, in
> this world, "direct map zapped iff
> kvm_has_mem_attributes(KVM_MEMORY_ATTRIBUTES_PRIVATE)", with memory
> attribute changes forcing the corresponding gmem state change. That's
> why I was talking about conversions above.

I think if we do continue to have state in guest_memfd, then direct map
removal should be based on guest_memfd's state, rather than
KVM_MEMORY_ATTRIBUTE_PRIVATE in mem_attr_array.

> <snip>
>
> Ah, admittedly I've thought of KVM_X86_SW_PROTECTED_VM as a bit of a
> playground where various configurations other VM types enforce can be
> mixed and matched (e.g. zero on conversions yes/no, direct map removal
> yes/no), so more of a KVM_X86_GMEM_VM, but am happy to update my
> understanding :)

Given the different axes of possible configurations for guest_memfd
(zero on conversion, direct map removal), I think it's better to let
userspace choose, than to enumerate the combinations in VM types.

Independently of whether to use a flag or VM type to configure
guest_memfd, the "zeroed" state has to be stored somewhere.

For folios to at least be zeroed once, presence in the filemap could
indicate "zeroed".

Presence in the filemap may be awkward to use as an indication of
"zeroed" for the conversion case.

What else can we use to store "zeroed"? Suggestions:

1. Since "prepared" already took the dirty bit on the folio, "zeroed"
   can use the checked bit on the folio. [2] indicates that it is for
   filesystems, which sounds like guest_memfd :)
2. folio->private (which we may already need to use)
3. Another xarray

[1] https://lore.kernel.org/all/diqz1q0qtqnd.fsf@ackerleytng-ctop.c.googlers.com/T/#ma6f828d7a50c4de8a2f829a16c9bb458b53d8f3f
[2] https://elixir.bootlin.com/linux/v6.11.4/source/include/linux/page-flags.h#L147
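A sketch of option 1, assuming made-up kvm_gmem_* helper names wrapped
around the real folio checked-bit accessors:

	/*
	 * Sketch: track "zeroed" with PG_checked, leaving PG_uptodate
	 * to mean "prepared".
	 */
	static inline bool kvm_gmem_folio_zeroed(struct folio *folio)
	{
		return folio_test_checked(folio);
	}

	static inline void kvm_gmem_mark_zeroed(struct folio *folio)
	{
		folio_set_checked(folio);
	}

	static void kvm_gmem_zero_folio_once(struct folio *folio)
	{
		unsigned long i;

		if (kvm_gmem_folio_zeroed(folio))
			return;

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));

		kvm_gmem_mark_zeroed(folio);
	}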
On Fri, 2024-10-18 at 00:16 +0100, Ackerley Tng wrote:
> Patrick Roy <roypat@amazon.co.uk> writes:
>
>> If I'm understanding this patch series correctly, the approach taken
>> here is to force the KVM memory attributes and the internal guest_memfd
>> state to be in-sync, because the VMA from mmap()ing guest_memfd is
>> reflected back into the userspace_addr of the memslot.
>
> In this patch series, we're also using guest_memfd state (the
> faultability xarray) to prevent any future faults before checking that
> there are no mappings. Further explanation at [1].
>
> Reason (a) at [1] is what Sean describes above to be what userspace
> _wants_ vs what the state _is_.

Ah, I was missing that detail about faultability being disabled, yet
mem_attrs not being updated until all pins are actually gone. Thanks!

Mh, I'm probably not seeing it because of my lack of experience with
CoCo setups, but how would pKVM not trusting userspace about conversions
cause mem_attrs and faultability to go out of sync? Or generally, if the
guest and userspace have different ideas about what is shared, and
userspace's idea is stored in mem_attrs (or rather, the part where they
can agree is stored in mem_attrs?), where do we store the guest's view
of it? Guest page tables?

>> So, to me, in
>> this world, "direct map zapped iff
>> kvm_has_mem_attributes(KVM_MEMORY_ATTRIBUTES_PRIVATE)", with memory
>> attribute changes forcing the corresponding gmem state change. That's
>> why I was talking about conversions above.
>
> I think if we do continue to have state in guest_memfd, then direct map
> removal should be based on guest_memfd's state, rather than
> KVM_MEMORY_ATTRIBUTE_PRIVATE in mem_attr_array.

I am not trying to argue against tracking it in guest_memfd, I'm just
wondering if mem attributes and direct map state would ever disagree.
But probably that's also just because of my confusion above :)

> <snip>
>
> [1] https://lore.kernel.org/all/diqz1q0qtqnd.fsf@ackerleytng-ctop.c.googlers.com/T/#ma6f828d7a50c4de8a2f829a16c9bb458b53d8f3f
On 2024-10-10 at 16:21+0000 Patrick Roy wrote:
> On Tue, 2024-10-08 at 20:56 +0100, Sean Christopherson wrote:
>> Another (slightly crazy) approach would be to use protection keys to provide
>> the security properties that you want, while giving KVM (and userspace) a
>> quick-and-easy override to access guest memory.
>>
>> <snip>
>
> Mh, but we only have 16 protection keys, so we cannot give each VM a
> unique one. And if all guest memory shares the same protection key, then
> during the on-demand swizzling the CPU would get access to _all_ guest
> memory on the host, which "feels" scary. What do you think, @Derek?

Yes, I am concerned about this. I don't see a way to use protection keys
that would ensure the host kernel cannot be tricked by one guest into
speculatively accessing another guest's memory (unless we do a key per
VM, which like you say severely limits how many guests you can host).

> Does ARM have something equivalent, btw?

Yes - the Permission Overlay Extension [1]. Although even the most
recent parts don't offer it. I don't see it in Neoverse V3 or Cortex-X4.

Derek

[1] https://lore.kernel.org/all/20240822151113.1479789-1-joey.gouly@arm.com/
On 2024-10-08 at 19:56+0000 Sean Christopherson wrote:
> Another (slightly crazy) approach would be to use protection keys to provide
> the security properties that you want, while giving KVM (and userspace) a
> quick-and-easy override to access guest memory.
>
>   1. mmap() guest_memfd into userspace with RW protections
>   2. Configure PKRU to make guest_memfd memory inaccessible by default
>   3. Swizzle PKRU on-demand when intentionally accessing guest memory
>
> <snip>

Yeah, this might be worth it if it simplifies significantly. Jenkins et
al. showed MPK worked for stopping in-process Spectre V1 [1]. While
future hardware bugs are always possible, the host kernel would still
offer better protection overall, since discovery of additional Spectre
approaches and gadgets in the kernel is more likely (I think it's a
bigger surface area than hardware-specific MPK transient execution
issues).

Patrick, we talked about this a couple weeks ago and ended up focusing
on within-userspace protection, but I see keys can also be used to stop
kernel access, like the project Andrew mentioned during Dave's MPK
session at LPC [2]. Andrew, could you share that here?

It's not clear to me how reliably the kernel prevents its own access to
such pages. I see a few papers that warrant more investigation:

"we found multiple interfaces that Linux, by design, provides for
accessing process memory that ignore PKU domains on a page." [3]

"Though Connor et al. demonstrate that existing MPK protections can be
bypassed by using the kernel as a confused deputy, compelling recent
work indicates that MPK operations can be made secure." [4]

Dave and others, if you're aware of resources clarifying how strong the
boundaries are, that would be helpful.

Derek

[1] https://www.cs.dartmouth.edu/~sws/pubs/jas2020.pdf
[2] https://www.youtube.com/watch?v=gEUeMfrNH94&t=1028s
[3] https://www.usenix.org/system/files/sec20-connor.pdf
[4] https://ics.uci.edu/~dabrowsa/kirth-eurosys22-pkru.pdf
On 09/10/2024 4:51 am, Manwaring, Derek wrote:
> On 2024-10-08 at 19:56+0000 Sean Christopherson wrote:
>> <snip>
>
> Patrick, we talked about this a couple weeks ago and ended up focusing
> on within-userspace protection, but I see keys can also be used to stop
> kernel access, like the project Andrew mentioned during Dave's MPK
> session at LPC [2]. Andrew, could you share that here?

This was in reference to PKS specifically (so Sapphire Rapids and
later), and also for Xen, but the technique is general.

Allocate one supervisor key for the directmap (and other ranges wanting
protecting), and configure MSR_PKS[key]=AD by default.

Protection Keys were identified as being safe as a defence against
Meltdown. At the time, only PKRU existed, and PKS was expected to have
been less overhead than KPTI on Skylake, which was even more frustrating
for those of us who'd begged for a supervisor form at the time. What's
done is done.

The changes needed in main code would be accessors for directmap
pointers, because there needs to be a temporary AD-disable. This would
take the form of 2x WRMSR, as opposed to a STAC/CLAC pair.

An area of concern is the overhead of the WRMSRs. MSR_PKS is defined as
not-architecturally-serialising, but like STAC/CLAC probably comes with
model-dependent dispatch-serialising properties to prevent memory
accesses executing speculatively under the wrong protection key.

Also, for this strategy to be effective, you need to PKEY-tag all
aliases of the memory.

~Andrew
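A rough sketch of what such accessors might look like; MSR_IA32_PKRS is
the real MSR, while the key assignment, the PKRS_DEFAULT value, and the
helper names are assumptions (real code would also have to manage
preemption and per-CPU MSR state):

	/*
	 * Sketch: temporarily lift the PKS access-disable (AD) bit for
	 * the key covering the directmap, around an intentional access.
	 * Each key owns two bits in PKRS; AD is bit 2*key.
	 */
	#define PKS_KEY_DIRECTMAP	1
	#define PKS_AD(key)		(1u << ((key) * 2))
	#define PKRS_DEFAULT		PKS_AD(PKS_KEY_DIRECTMAP)

	static inline void directmap_access_begin(void)
	{
		wrmsrl(MSR_IA32_PKRS,
		       PKRS_DEFAULT & ~PKS_AD(PKS_KEY_DIRECTMAP));
	}

	static inline void directmap_access_end(void)
	{
		wrmsrl(MSR_IA32_PKRS, PKRS_DEFAULT);
	}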
Ackerley Tng <ackerleytng@google.com> writes:

> Elliot Berman <quic_eberman@quicinc.com> writes:
>> <snip>
>
> Zeroing the page prevents leaking host data (see the function docstring
> for kvm_gmem_prepare_folio() introduced in [1]), so we definitely don't
> want to introduce a kernel data leak bug here.

Actually it seems like filemap_grab_folio() already gets a zeroed page.

filemap_grab_folio() eventually calls __alloc_pages_noprof()
-> get_page_from_freelist()
-> prep_new_page()
-> post_alloc_hook()

and post_alloc_hook() calls kernel_init_pages(), which zeroes the page,
depending on kernel config.

Paolo, was calling clear_highpage() in kvm_gmem_prepare_folio() zeroing
an already empty page returned from filemap_grab_folio()?

> <snip>
>
> [1] https://lore.kernel.org/all/20240726185157.72821-8-pbonzini@redhat.com/
> [2] https://lore.kernel.org/all/diqz34ldszp3.fsf@ackerleytng-ctop.c.googlers.com/
On Thu, Oct 03, 2024, Ackerley Tng wrote:
> Ackerley Tng <ackerleytng@google.com> writes:
>> Zeroing the page prevents leaking host data (see the function docstring
>> for kvm_gmem_prepare_folio() introduced in [1]), so we definitely don't
>> want to introduce a kernel data leak bug here.
>
> Actually it seems like filemap_grab_folio() already gets a zeroed page.
>
> filemap_grab_folio() eventually calls __alloc_pages_noprof()
> -> get_page_from_freelist()
> -> prep_new_page()
> -> post_alloc_hook()
>
> and post_alloc_hook() calls kernel_init_pages(), which zeroes the page,
> depending on kernel config.
>
> Paolo, was calling clear_highpage() in kvm_gmem_prepare_folio() zeroing
> an already empty page returned from filemap_grab_folio()?

Yes and no.  CONFIG_INIT_ON_ALLOC_DEFAULT_ON and init_on_alloc are very much
hardening features, not functional behavior that other code _needs_ to be
aware of.  E.g. enabling init-on-alloc comes with a measurable performance
cost.

Ignoring hardening, the guest_memfd mapping specifically sets the gfp_mask to
GFP_HIGHUSER, i.e. doesn't set __GFP_ZERO.

That said, I wouldn't be opposed to skipping the clear_highpage() call when
want_init_on_alloc() is true.

Also, the intended behavior (or at least, what I intended) of
kvm_gmem_prepare_folio() was that it would do clear_highpage() if and only if
a trusted entity does NOT zero the page.  Factoring that in is a bit harder,
as it probably requires another arch hook (or providing an out-param from
kvm_arch_gmem_prepare()).  I.e. the want_init_on_alloc() case isn't the only
time KVM could shave cycles by not redundantly zeroing memory.
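A sketch of the skip suggested above; want_init_on_alloc() is the real
helper, while the surrounding function is illustrative:

	/*
	 * Sketch: skip the redundant clear_highpage() when init-on-alloc
	 * already zeroed the freshly allocated folio. GFP_HIGHUSER matches
	 * the mask guest_memfd sets for its mapping.
	 */
	static void kvm_gmem_zero_folio(struct folio *folio)
	{
		unsigned long i;

		if (want_init_on_alloc(GFP_HIGHUSER))
			return;	/* page allocator already zeroed the pages */

		for (i = 0; i < folio_nr_pages(folio); i++)
			clear_highpage(folio_page(folio, i));
	}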