[RFC PATCH 29/39] KVM: Handle conversions in the SET_MEMORY_ATTRIBUTES ioctl

Ackerley Tng posted 39 patches 2 months, 2 weeks ago
[RFC PATCH 29/39] KVM: Handle conversions in the SET_MEMORY_ATTRIBUTES ioctl
Posted by Ackerley Tng 2 months, 2 weeks ago
The key steps for a private to shared conversion are:

1. Unmap from guest page tables
2. Set pages associated with requested range in memslot to be
   faultable
3. Update kvm->mem_attr_array

The key steps for a shared to private conversion are:

1. Check and disallow set_memory_attributes if any page in the range
   is still mapped or pinned, by
   a. Updating guest_memfd's faultability to prevent future faulting
   b. Returning -EINVAL if any pages are still mapped or pinned.
2. Update kvm->mem_attr_array

The userspace VMM must ensure that shared pages are no longer in use
before converting them, since any fault racing with this call will
receive a SIGBUS.

Co-developed-by: Ackerley Tng <ackerleytng@google.com>
Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Co-developed-by: Vishal Annapurve <vannapurve@google.com>
Signed-off-by: Vishal Annapurve <vannapurve@google.com>

---
 include/linux/kvm_host.h |   1 +
 virt/kvm/guest_memfd.c   | 207 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/kvm_main.c      |  15 +++
 virt/kvm/kvm_mm.h        |   9 ++
 4 files changed, 232 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 79a6b1a63027..10993cd33e34 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2476,6 +2476,7 @@ typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
 
 long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
 		       kvm_gmem_populate_cb post_populate, void *opaque);
+
 #endif
 
 #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_INVALIDATE
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 1d4dfe0660ad..110c4bbb004b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1592,4 +1592,211 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
 	return ret && !i ? ret : i;
 }
 EXPORT_SYMBOL_GPL(kvm_gmem_populate);
+
+/*
+ * Returns true if no folio found in range [@start, @end) of inode @inode is
+ * mapped or possibly DMA-pinned. Indices whose folio lookup fails are skipped.
+ */
+static bool kvm_gmem_no_mappings_range(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+	bool all_clear = true;
+	pgoff_t pos = start;
+
+	filemap_invalidate_lock_shared(inode->i_mapping);
+
+	/* TODO: replace iteration with filemap_get_folios() for efficiency. */
+	while (all_clear && pos < end) {
+		struct folio *folio;
+
+		/* Lookup only; kvm_gmem_get_folio is avoided as it allocates. */
+		folio = filemap_lock_folio(inode->i_mapping, pos);
+		if (IS_ERR(folio)) {
+			/* No folio was returned for this index; try the next. */
+			pos++;
+			continue;
+		}
+
+		if (!folio_mapped(folio) && !folio_maybe_dma_pinned(folio))
+			pos = folio_next_index(folio);
+		else
+			all_clear = false;
+
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+
+	filemap_invalidate_unlock_shared(inode->i_mapping);
+	return all_clear;
+}
+
+/*
+ * Returns true if the guest_memfd pages backing gfn range [@start, @end) of
+ * memslot @slot are neither mapped nor possibly pinned. Returns false if they
+ * are, or if the memslot's guest_memfd file cannot be obtained.
+ */
+static bool kvm_gmem_no_mappings_slot(struct kvm_memory_slot *slot,
+				      gfn_t start, gfn_t end)
+{
+	struct file *gmem_file = kvm_gmem_get_file(slot);
+	pgoff_t first;
+	pgoff_t last;
+	bool unmapped;
+
+	if (!gmem_file)
+		return false;
+
+	/* Translate gfns to page offsets within the guest_memfd file. */
+	first = start - slot->base_gfn + slot->gmem.pgoff;
+	last = end - slot->base_gfn + slot->gmem.pgoff;
+
+	unmapped = kvm_gmem_no_mappings_range(file_inode(gmem_file), first, last);
+	fput(gmem_file);
+
+	return unmapped;
+}
+
+/*
+ * Returns true if no guest_memfd-backed memslot overlapping gfn range
+ * [@start, @end) reports mapped or pinned pages. Needs kvm->slots_lock held.
+ */
+static bool kvm_gmem_no_mappings(struct kvm *kvm, gfn_t start, gfn_t end)
+{
+	int as_id;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	for (as_id = 0; as_id < kvm_arch_nr_memslot_as_ids(kvm); as_id++) {
+		struct kvm_memslots *slots = __kvm_memslots(kvm, as_id);
+		struct kvm_memslot_iter iter;
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+			struct kvm_memory_slot *slot = iter.slot;
+			bool is_gmem = slot->flags & KVM_MEM_GUEST_MEMFD;
+			gfn_t lo, hi;
+
+			/* Clamp the request to the part inside this slot. */
+			lo = max(start, slot->base_gfn);
+			hi = min(end, slot->base_gfn + slot->npages);
+
+			/* Only guest_memfd-backed slots are checked. */
+			if (is_gmem && !kvm_gmem_no_mappings_slot(slot, lo, hi))
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Mark gfn range [@start, @end) of memslot @slot as @faultable in the backing
+ * guest_memfd inode. Does nothing if the slot's file cannot be obtained.
+ */
+static void kvm_gmem_set_faultable_slot(struct kvm_memory_slot *slot, gfn_t start,
+					gfn_t end, bool faultable)
+{
+	struct file *gmem_file = kvm_gmem_get_file(slot);
+	pgoff_t first;
+	pgoff_t last;
+
+	if (!gmem_file)
+		return;
+
+	/* Translate gfns to page offsets within the guest_memfd file. */
+	first = start - slot->base_gfn + slot->gmem.pgoff;
+	last = end - slot->base_gfn + slot->gmem.pgoff;
+
+	WARN_ON(kvm_gmem_set_faultable(file_inode(gmem_file), first, last,
+				       faultable));
+
+	fput(gmem_file);
+}
+
+/*
+ * Set faultability of gfn range [@start, @end) to @faultable in every
+ * guest_memfd-backed memslot of @kvm that overlaps the range, across all
+ * memslot address spaces. Must be called with kvm->slots_lock held.
+ */
+static void kvm_gmem_set_faultable_vm(struct kvm *kvm, gfn_t start, gfn_t end,
+				      bool faultable)
+{
+	int as_id;
+
+	lockdep_assert_held(&kvm->slots_lock);
+
+	for (as_id = 0; as_id < kvm_arch_nr_memslot_as_ids(kvm); as_id++) {
+		struct kvm_memslots *slots = __kvm_memslots(kvm, as_id);
+		struct kvm_memslot_iter iter;
+
+		kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
+			struct kvm_memory_slot *slot = iter.slot;
+			bool is_gmem = slot->flags & KVM_MEM_GUEST_MEMFD;
+			gfn_t lo;
+			gfn_t hi;
+
+			/* Clamp the request to the part inside this slot. */
+			lo = max(start, slot->base_gfn);
+			hi = min(end, slot->base_gfn + slot->npages);
+
+			if (is_gmem)
+				kvm_gmem_set_faultable_slot(slot, lo, hi,
+							    faultable);
+		}
+	}
+}
+
+/*
+ * Returns 0 if guest_memfd permits setting range [@start, @end) to PRIVATE,
+ * or -EINVAL if the range cannot be confirmed free of host mappings and pins.
+ * The range is made unfaultable before the check to prevent future faulting.
+ * On failure the whole range is made faultable again - NOTE(review): even
+ * parts that may already have been unfaultable; confirm this is intended.
+ */
+static int kvm_gmem_should_set_attributes_private(struct kvm *kvm, gfn_t start,
+						  gfn_t end)
+{
+	kvm_gmem_set_faultable_vm(kvm, start, end, false);
+
+	if (kvm_gmem_no_mappings(kvm, start, end))
+		return 0;
+
+	kvm_gmem_set_faultable_vm(kvm, start, end, true);
+	return -EINVAL;
+}
+
+/*
+ * Always permits setting range [@start, @end) to SHARED: marks the range
+ * faultable and returns 0.
+ * Because this allows pages to be faulted in to userspace, this must only be
+ * called after the pages have been invalidated from guest page tables.
+ */
+static int kvm_gmem_should_set_attributes_shared(struct kvm *kvm, gfn_t start,
+						 gfn_t end)
+{
+	/* Always okay to set shared, hence set range faultable here. */
+	kvm_gmem_set_faultable_vm(kvm, start, end, true);
+
+	return 0;
+}
+
+/*
+ * Ask guest_memfd whether attributes @attrs may be set on gfn range
+ * [@start, @end). Returns 0 if so, or a negative error otherwise.
+ *
+ * A request to set PRIVATE is refused while pages in the range are still
+ * mapped into host userspace or possibly pinned.
+ *
+ * A request to set SHARED may make pages faultable by userspace, so this must
+ * only be called after the range was invalidated from guest page tables.
+ */
+int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				   unsigned long attrs)
+{
+	bool to_private = attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+
+	if (!to_private)
+		return kvm_gmem_should_set_attributes_shared(kvm, start, end);
+
+	return kvm_gmem_should_set_attributes_private(kvm, start, end);
+}
+
 #endif
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 92901656a0d4..1a7bbcc31b7e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2524,6 +2524,13 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 		.on_lock = kvm_mmu_invalidate_end,
 		.may_block = true,
 	};
+	struct kvm_mmu_notifier_range error_set_range = {
+		.start = start,
+		.end = end,
+		.handler = (void *)kvm_null_fn,
+		.on_lock = kvm_mmu_invalidate_end,
+		.may_block = true,
+	};
 	unsigned long i;
 	void *entry;
 	int r = 0;
@@ -2548,6 +2555,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 
 	kvm_handle_gfn_range(kvm, &pre_set_range);
 
+	r = kvm_gmem_should_set_attributes(kvm, start, end, attributes);
+	if (r)
+		goto err;
+
 	for (i = start; i < end; i++) {
 		r = xa_err(xa_store(&kvm->mem_attr_array, i, entry,
 				    GFP_KERNEL_ACCOUNT));
@@ -2560,6 +2571,10 @@ static int kvm_vm_set_mem_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
 	mutex_unlock(&kvm->slots_lock);
 
 	return r;
+
+err:
+	kvm_handle_gfn_range(kvm, &error_set_range);
+	goto out_unlock;
 }
 static int kvm_vm_ioctl_set_mem_attributes(struct kvm *kvm,
 					   struct kvm_memory_attributes *attrs)
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 715f19669d01..d8ff2b380d0e 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -41,6 +41,8 @@ int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
 int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
 		  unsigned int fd, loff_t offset);
 void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start, gfn_t end,
+				   unsigned long attrs);
 #else
 static inline void kvm_gmem_init(struct module *module)
 {
@@ -59,6 +61,13 @@ static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
 {
 	WARN_ON_ONCE(1);
 }
+
+static inline int kvm_gmem_should_set_attributes(struct kvm *kvm, gfn_t start,
+						 gfn_t end, unsigned long attrs)
+{
+	return 0;	/* !CONFIG_KVM_PRIVATE_MEM stub: always permit */
+}
+
 #endif /* CONFIG_KVM_PRIVATE_MEM */
 
 #endif /* __KVM_MM_H__ */
-- 
2.46.0.598.g6f2099f65c-goog