[RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls

Posted by Ackerley Tng 7 months, 1 week ago
The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
and private respectively.

A guest_memfd ioctl is used because shareability is a property of the
memory, and this property should be modifiable independently of the
attached struct kvm. This allows shareability to be modified even if
the memory is not yet bound using memslots.

For shared to private conversions, if refcounts on any of the folios
within the range are elevated, fail the conversion with -EAGAIN.

At the point of shared to private conversion, all folios in range are
also unmapped. The filemap_invalidate_lock() is held, so no faulting
can occur. Hence, from that point on, only transient refcounts can be
taken on the folios associated with that guest_memfd.

It is therefore safe to do the conversion from shared to private.

After conversion is complete, refcounts may become elevated, but that
is fine since users of transient refcounts don't actually access
memory.

For private to shared conversions, there are no refcount checks:
holders of transient refcounts are expected to drop them soon. The
conversion process will spin waiting for these transient refcounts to
go away.
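
For illustration, a minimal userspace sketch of the intended flow
(illustrative only, not part of this patch; assumes gmem_fd came from
KVM_CREATE_GUEST_MEMFD and that offset/len are PAGE_SIZE-aligned):

	#include <errno.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int convert_private(int gmem_fd, __u64 offset, __u64 len)
	{
		struct kvm_gmem_convert param = {
			.offset = offset,
			.size = len,
		};

		while (ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &param) < 0) {
			if (errno != EAGAIN)
				return -errno;
			/*
			 * param.error_offset identifies the first offset
			 * with unexpected refcounts. Release host
			 * references (e.g. munmap() any mapping of the
			 * range), clear error_offset (the ioctl requires
			 * it to be zero on input) and retry.
			 */
			param.error_offset = 0;
		}

		return 0;
	}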

Signed-off-by: Ackerley Tng <ackerleytng@google.com>

Change-Id: I3546aaf6c1b795de6dc9ba09e816b64934221918
---
 include/uapi/linux/kvm.h |  11 ++
 virt/kvm/guest_memfd.c   | 356 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 365 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d7df312479aa..5b28e17f6f14 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -1577,6 +1577,17 @@ struct kvm_create_guest_memfd {
 	__u64 reserved[6];
 };
 
+#define KVM_GMEM_IO 0xAF
+#define KVM_GMEM_CONVERT_SHARED		_IOWR(KVM_GMEM_IO,  0x41, struct kvm_gmem_convert)
+#define KVM_GMEM_CONVERT_PRIVATE	_IOWR(KVM_GMEM_IO,  0x42, struct kvm_gmem_convert)
+
+struct kvm_gmem_convert {
+	__u64 offset;
+	__u64 size;
+	__u64 error_offset;
+	__u64 reserved[5];
+};
+
 #define KVM_PRE_FAULT_MEMORY	_IOWR(KVMIO, 0xd5, struct kvm_pre_fault_memory)
 
 struct kvm_pre_fault_memory {
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 590932499eba..f802116290ce 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -30,6 +30,10 @@ enum shareability {
 };
 
 static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
+static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+				      pgoff_t end);
+static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
+				    pgoff_t end);
 
 static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
 {
@@ -85,6 +89,305 @@ static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t inde
 	return kvm_gmem_get_folio(inode, index);
 }
 
+/**
+ * kvm_gmem_shareability_store() - Sets shareability to @value for range.
+ *
+ * @mt: the shareability maple tree.
+ * @index: the range begins at this index in the inode.
+ * @nr_pages: number of PAGE_SIZE pages in this range.
+ * @value: the shareability value to set for this range.
+ *
+ * Unlike mtree_store_range(), this function also merges adjacent ranges that
+ * have the same values as an optimization. Assumes that all stores to @mt go
+ * through this function, such that adjacent ranges are always merged.
+ *
+ * Return: 0 on success and negative error otherwise.
+ */
+static int kvm_gmem_shareability_store(struct maple_tree *mt, pgoff_t index,
+				       size_t nr_pages, enum shareability value)
+{
+	MA_STATE(mas, mt, 0, 0);
+	unsigned long start;
+	unsigned long last;
+	void *entry;
+	int ret;
+
+	start = index;
+	last = start + nr_pages - 1;
+
+	mas_lock(&mas);
+
+	/* Try extending range. entry is NULL on overflow/wrap-around. */
+	mas_set_range(&mas, last + 1, last + 1);
+	entry = mas_find(&mas, last + 1);
+	if (entry && xa_to_value(entry) == value)
+		last = mas.last;
+
+	mas_set_range(&mas, start - 1, start - 1);
+	entry = mas_find(&mas, start - 1);
+	if (entry && xa_to_value(entry) == value)
+		start = mas.index;
+
+	mas_set_range(&mas, start, last);
+	ret = mas_store_gfp(&mas, xa_mk_value(value), GFP_KERNEL);
+
+	mas_unlock(&mas);
+
+	return ret;
+}
+
+struct conversion_work {
+	struct list_head list;
+	pgoff_t start;
+	size_t nr_pages;
+};
+
+static int add_to_work_list(struct list_head *list, pgoff_t start, pgoff_t last)
+{
+	struct conversion_work *work;
+
+	work = kzalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	work->start = start;
+	work->nr_pages = last + 1 - start;
+
+	list_add_tail(&work->list, list);
+
+	return 0;
+}
+
+static bool kvm_gmem_has_safe_refcount(struct address_space *mapping, pgoff_t start,
+				       size_t nr_pages, pgoff_t *error_index)
+{
+	const int filemap_get_folios_refcount = 1;
+	struct folio_batch fbatch;
+	bool refcount_safe;
+	pgoff_t last;
+	int i;
+
+	last = start + nr_pages - 1;
+	refcount_safe = true;
+
+	folio_batch_init(&fbatch);
+	while (refcount_safe &&
+	       filemap_get_folios(mapping, &start, last, &fbatch)) {
+
+		for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+			int filemap_refcount;
+			int safe_refcount;
+			struct folio *f;
+
+			f = fbatch.folios[i];
+			filemap_refcount = folio_nr_pages(f);
+
+			safe_refcount = filemap_refcount + filemap_get_folios_refcount;
+			if (folio_ref_count(f) != safe_refcount) {
+				refcount_safe = false;
+				*error_index = f->index;
+				break;
+			}
+		}
+
+		folio_batch_release(&fbatch);
+	}
+
+	return refcount_safe;
+}
+
+static int kvm_gmem_shareability_apply(struct inode *inode,
+				       struct conversion_work *work,
+				       enum shareability m)
+{
+	struct maple_tree *mt;
+
+	mt = &kvm_gmem_private(inode)->shareability;
+	return kvm_gmem_shareability_store(mt, work->start, work->nr_pages, m);
+}
+
+static int kvm_gmem_convert_compute_work(struct inode *inode, pgoff_t start,
+					 size_t nr_pages, enum shareability m,
+					 struct list_head *work_list)
+{
+	struct maple_tree *mt;
+	struct ma_state mas;
+	pgoff_t last;
+	void *entry;
+	int ret;
+
+	last = start + nr_pages - 1;
+
+	mt = &kvm_gmem_private(inode)->shareability;
+	ret = 0;
+
+	mas_init(&mas, mt, start);
+
+	rcu_read_lock();
+	mas_for_each(&mas, entry, last) {
+		enum shareability current_m;
+		pgoff_t m_range_index;
+		pgoff_t m_range_last;
+
+		m_range_index = max(mas.index, start);
+		m_range_last = min(mas.last, last);
+
+		current_m = xa_to_value(entry);
+		if (m == current_m)
+			continue;
+
+		mas_pause(&mas);
+		rcu_read_unlock();
+		/* Caller will clean this up on error. */
+		ret = add_to_work_list(work_list, m_range_index, m_range_last);
+		rcu_read_lock();
+		if (ret)
+			break;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static void kvm_gmem_convert_invalidate_begin(struct inode *inode,
+					      struct conversion_work *work)
+{
+	struct list_head *gmem_list;
+	struct kvm_gmem *gmem;
+	pgoff_t end;
+
+	end = work->start + work->nr_pages;
+
+	gmem_list = &inode->i_mapping->i_private_list;
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_begin(gmem, work->start, end);
+}
+
+static void kvm_gmem_convert_invalidate_end(struct inode *inode,
+					    struct conversion_work *work)
+{
+	struct list_head *gmem_list;
+	struct kvm_gmem *gmem;
+	pgoff_t end;
+
+	end = work->start + work->nr_pages;
+
+	gmem_list = &inode->i_mapping->i_private_list;
+	list_for_each_entry(gmem, gmem_list, entry)
+		kvm_gmem_invalidate_end(gmem, work->start, end);
+}
+
+static int kvm_gmem_convert_should_proceed(struct inode *inode,
+					   struct conversion_work *work,
+					   bool to_shared, pgoff_t *error_index)
+{
+	if (!to_shared) {
+		unmap_mapping_pages(inode->i_mapping, work->start,
+				    work->nr_pages, false);
+
+		if (!kvm_gmem_has_safe_refcount(inode->i_mapping, work->start,
+						work->nr_pages, error_index)) {
+			return -EAGAIN;
+		}
+	}
+
+	return 0;
+}
+
+static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
+				  size_t nr_pages, bool shared,
+				  pgoff_t *error_index)
+{
+	struct conversion_work *work, *tmp, *rollback_stop_item;
+	LIST_HEAD(work_list);
+	struct inode *inode;
+	enum shareability m;
+	int ret;
+
+	inode = file_inode(file);
+
+	filemap_invalidate_lock(inode->i_mapping);
+
+	m = shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
+	ret = kvm_gmem_convert_compute_work(inode, start, nr_pages, m, &work_list);
+	if (ret || list_empty(&work_list))
+		goto out;
+
+	list_for_each_entry(work, &work_list, list)
+		kvm_gmem_convert_invalidate_begin(inode, work);
+
+	list_for_each_entry(work, &work_list, list) {
+		ret = kvm_gmem_convert_should_proceed(inode, work, shared,
+						      error_index);
+		if (ret)
+			goto invalidate_end;
+	}
+
+	list_for_each_entry(work, &work_list, list) {
+		rollback_stop_item = work;
+		ret = kvm_gmem_shareability_apply(inode, work, m);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
+		list_for_each_entry(work, &work_list, list) {
+			if (work == rollback_stop_item)
+				break;
+
+			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
+		}
+	}
+
+invalidate_end:
+	list_for_each_entry(work, &work_list, list)
+		kvm_gmem_convert_invalidate_end(inode, work);
+out:
+	filemap_invalidate_unlock(inode->i_mapping);
+
+	list_for_each_entry_safe(work, tmp, &work_list, list) {
+		list_del(&work->list);
+		kfree(work);
+	}
+
+	return ret;
+}
+
+static int kvm_gmem_ioctl_convert_range(struct file *file,
+					struct kvm_gmem_convert *param,
+					bool shared)
+{
+	pgoff_t error_index;
+	size_t nr_pages;
+	pgoff_t start;
+	int ret;
+
+	if (param->error_offset)
+		return -EINVAL;
+
+	if (param->size == 0)
+		return 0;
+
+	if (param->offset + param->size < param->offset ||
+	    param->offset > file_inode(file)->i_size ||
+	    param->offset + param->size > file_inode(file)->i_size)
+		return -EINVAL;
+
+	if (!IS_ALIGNED(param->offset, PAGE_SIZE) ||
+	    !IS_ALIGNED(param->size, PAGE_SIZE))
+		return -EINVAL;
+
+	start = param->offset >> PAGE_SHIFT;
+	nr_pages = param->size >> PAGE_SHIFT;
+
+	ret = kvm_gmem_convert_range(file, start, nr_pages, shared, &error_index);
+	if (ret)
+		param->error_offset = error_index << PAGE_SHIFT;
+
+	return ret;
+}
+
 #else
 
 static int kvm_gmem_shareability_setup(struct maple_tree *mt, loff_t size, u64 flags)
@@ -186,15 +490,26 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
 	unsigned long index;
 
 	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+		enum kvm_gfn_range_filter filter;
 		pgoff_t pgoff = slot->gmem.pgoff;
 
+		filter = KVM_FILTER_PRIVATE;
+		if (kvm_gmem_memslot_supports_shared(slot)) {
+			/*
+			 * Unmapping would also cause invalidation, but cannot
+			 * rely on mmu_notifiers to do invalidation via
+			 * unmapping, since memory may not be mapped to
+			 * userspace.
+			 */
+			filter |= KVM_FILTER_SHARED;
+		}
+
 		struct kvm_gfn_range gfn_range = {
 			.start = slot->base_gfn + max(pgoff, start) - pgoff,
 			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
 			.slot = slot,
 			.may_block = true,
-			/* guest memfd is relevant to only private mappings. */
-			.attr_filter = KVM_FILTER_PRIVATE,
+			.attr_filter = filter,
 		};
 
 		if (!found_memslot) {
@@ -484,11 +799,49 @@ EXPORT_SYMBOL_GPL(kvm_gmem_memslot_supports_shared);
 #define kvm_gmem_mmap NULL
 #endif /* CONFIG_KVM_GMEM_SHARED_MEM */
 
+static long kvm_gmem_ioctl(struct file *file, unsigned int ioctl,
+			   unsigned long arg)
+{
+	void __user *argp;
+	int r;
+
+	argp = (void __user *)arg;
+
+	switch (ioctl) {
+#ifdef CONFIG_KVM_GMEM_SHARED_MEM
+	case KVM_GMEM_CONVERT_SHARED:
+	case KVM_GMEM_CONVERT_PRIVATE: {
+		struct kvm_gmem_convert param;
+		bool to_shared;
+
+		r = -EFAULT;
+		if (copy_from_user(&param, argp, sizeof(param)))
+			goto out;
+
+		to_shared = ioctl == KVM_GMEM_CONVERT_SHARED;
+		r = kvm_gmem_ioctl_convert_range(file, &param, to_shared);
+		if (r) {
+			if (copy_to_user(argp, &param, sizeof(param))) {
+				r = -EFAULT;
+				goto out;
+			}
+		}
+		break;
+	}
+#endif
+	default:
+		r = -ENOTTY;
+	}
+out:
+	return r;
+}
+
 static struct file_operations kvm_gmem_fops = {
 	.mmap		= kvm_gmem_mmap,
 	.open		= generic_file_open,
 	.release	= kvm_gmem_release,
 	.fallocate	= kvm_gmem_fallocate,
+	.unlocked_ioctl	= kvm_gmem_ioctl,
 };
 
 static void kvm_gmem_free_inode(struct inode *inode)
-- 
2.49.0.1045.g170613ef41-goog
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Binbin Wu 6 months, 3 weeks ago

On 5/15/2025 7:41 AM, Ackerley Tng wrote:

[...]
> +
> +static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
> +				  size_t nr_pages, bool shared,
> +				  pgoff_t *error_index)
> +{
> +	struct conversion_work *work, *tmp, *rollback_stop_item;
> +	LIST_HEAD(work_list);
> +	struct inode *inode;
> +	enum shareability m;
> +	int ret;
> +
> +	inode = file_inode(file);
> +
> +	filemap_invalidate_lock(inode->i_mapping);
> +
> +	m = shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
> +	ret = kvm_gmem_convert_compute_work(inode, start, nr_pages, m, &work_list);
> +	if (ret || list_empty(&work_list))
> +		goto out;
> +
> +	list_for_each_entry(work, &work_list, list)
> +		kvm_gmem_convert_invalidate_begin(inode, work);
> +
> +	list_for_each_entry(work, &work_list, list) {
> +		ret = kvm_gmem_convert_should_proceed(inode, work, shared,
> +						      error_index);

Since kvm_gmem_invalidate_begin() now also handles shared memory,
kvm_gmem_convert_invalidate_begin() will zap the page tables.
The shared mapping could be zapped in kvm_gmem_convert_invalidate_begin() even
when kvm_gmem_convert_should_proceed() returns an error.
The sequence is a bit confusing to me, at least in this patch so far.

> +		if (ret)
> +			goto invalidate_end;
> +	}
> +
> +	list_for_each_entry(work, &work_list, list) {
> +		rollback_stop_item = work;
> +		ret = kvm_gmem_shareability_apply(inode, work, m);
> +		if (ret)
> +			break;
> +	}
> +
> +	if (ret) {
> +		m = shared ? SHAREABILITY_GUEST : SHAREABILITY_ALL;
> +		list_for_each_entry(work, &work_list, list) {
> +			if (work == rollback_stop_item)
> +				break;
> +
> +			WARN_ON(kvm_gmem_shareability_apply(inode, work, m));
> +		}
> +	}
> +
> +invalidate_end:
> +	list_for_each_entry(work, &work_list, list)
> +		kvm_gmem_convert_invalidate_end(inode, work);
> +out:
> +	filemap_invalidate_unlock(inode->i_mapping);
> +
> +	list_for_each_entry_safe(work, tmp, &work_list, list) {
> +		list_del(&work->list);
> +		kfree(work);
> +	}
> +
> +	return ret;
> +}
> +
[...]
> @@ -186,15 +490,26 @@ static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
>   	unsigned long index;
>   
>   	xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> +		enum kvm_gfn_range_filter filter;
>   		pgoff_t pgoff = slot->gmem.pgoff;
>   
> +		filter = KVM_FILTER_PRIVATE;
> +		if (kvm_gmem_memslot_supports_shared(slot)) {
> +			/*
> +			 * Unmapping would also cause invalidation, but cannot
> +			 * rely on mmu_notifiers to do invalidation via
> +			 * unmapping, since memory may not be mapped to
> +			 * userspace.
> +			 */
> +			filter |= KVM_FILTER_SHARED;
> +		}
> +
>   		struct kvm_gfn_range gfn_range = {
>   			.start = slot->base_gfn + max(pgoff, start) - pgoff,
>   			.end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
>   			.slot = slot,
>   			.may_block = true,
> -			/* guest memfd is relevant to only private mappings. */
> -			.attr_filter = KVM_FILTER_PRIVATE,
> +			.attr_filter = filter,
>   		};
>   
>   		if (!found_memslot) {
> @@ -484,11 +799,49 @@ EXPORT_SYMBOL_GPL(kvm_gmem_memslot_supports_shared);
>   #define kvm_gmem_mmap NULL
>   #endif /* CONFIG_KVM_GMEM_SHARED_MEM */
>   
[...]
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ackerley Tng 6 months, 3 weeks ago
Binbin Wu <binbin.wu@linux.intel.com> writes:

> On 5/15/2025 7:41 AM, Ackerley Tng wrote:
>
> [...]
>> +
>> +static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>> +				  size_t nr_pages, bool shared,
>> +				  pgoff_t *error_index)
>> +{
>> +	struct conversion_work *work, *tmp, *rollback_stop_item;
>> +	LIST_HEAD(work_list);
>> +	struct inode *inode;
>> +	enum shareability m;
>> +	int ret;
>> +
>> +	inode = file_inode(file);
>> +
>> +	filemap_invalidate_lock(inode->i_mapping);
>> +
>> +	m = shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
>> +	ret = kvm_gmem_convert_compute_work(inode, start, nr_pages, m, &work_list);
>> +	if (ret || list_empty(&work_list))
>> +		goto out;
>> +
>> +	list_for_each_entry(work, &work_list, list)
>> +		kvm_gmem_convert_invalidate_begin(inode, work);
>> +
>> +	list_for_each_entry(work, &work_list, list) {
>> +		ret = kvm_gmem_convert_should_proceed(inode, work, shared,
>> +						      error_index);
>
> Since kvm_gmem_invalidate_begin() now also handles shared memory,
> kvm_gmem_convert_invalidate_begin() will zap the page tables.
> The shared mapping could be zapped in kvm_gmem_convert_invalidate_begin() even
> when kvm_gmem_convert_should_proceed() returns an error.
> The sequence is a bit confusing to me, at least in this patch so far.
>

It is true that zapping of pages from the guest page table will happen
before we figure out whether conversion is allowed.

For a shared-to-private conversion, we will definitely unmap from the
host before checking if conversion is allowed, and there's no choice
there since conversion is allowed if there are no unexpected refcounts,
and the way to eliminate expected refcounts is to unmap from the host.

Since we're unmapping before checking if conversion is allowed, I
thought it would be fine to also zap from guest page tables before
checking if conversion is allowed.

Conversion is not meant to happen very regularly, and even if it is
unmapped or zapped, the next access will fault in the page anyway, so
there is a performance but not a functionality impact.

Hope that helps. Is it still odd to zap before checking if conversion
should proceed?

> [...]
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Binbin Wu 6 months, 2 weeks ago

On 5/31/2025 4:10 AM, Ackerley Tng wrote:
> Binbin Wu <binbin.wu@linux.intel.com> writes:
>
>> On 5/15/2025 7:41 AM, Ackerley Tng wrote:
>>
>> [...]
>>> +
>>> +static int kvm_gmem_convert_range(struct file *file, pgoff_t start,
>>> +				  size_t nr_pages, bool shared,
>>> +				  pgoff_t *error_index)
>>> +{
>>> +	struct conversion_work *work, *tmp, *rollback_stop_item;
>>> +	LIST_HEAD(work_list);
>>> +	struct inode *inode;
>>> +	enum shareability m;
>>> +	int ret;
>>> +
>>> +	inode = file_inode(file);
>>> +
>>> +	filemap_invalidate_lock(inode->i_mapping);
>>> +
>>> +	m = shared ? SHAREABILITY_ALL : SHAREABILITY_GUEST;
>>> +	ret = kvm_gmem_convert_compute_work(inode, start, nr_pages, m, &work_list);
>>> +	if (ret || list_empty(&work_list))
>>> +		goto out;
>>> +
>>> +	list_for_each_entry(work, &work_list, list)
>>> +		kvm_gmem_convert_invalidate_begin(inode, work);
>>> +
>>> +	list_for_each_entry(work, &work_list, list) {
>>> +		ret = kvm_gmem_convert_should_proceed(inode, work, shared,
>>> +						      error_index);
>> Since kvm_gmem_invalidate_begin() now also handles shared memory,
>> kvm_gmem_convert_invalidate_begin() will zap the page tables.
>> The shared mapping could be zapped in kvm_gmem_convert_invalidate_begin() even
>> when kvm_gmem_convert_should_proceed() returns an error.
>> The sequence is a bit confusing to me, at least in this patch so far.
>>
> It is true that zapping of pages from the guest page table will happen
> before we figure out whether conversion is allowed.
>
> For a shared-to-private conversion, we will definitely unmap from the
> host before checking if conversion is allowed, and there's no choice
> there since conversion is allowed if there are no unexpected refcounts,
> and the way to eliminate expected refcounts is to unmap from the host.
>
> Since we're unmapping before checking if conversion is allowed, I
> thought it would be fine to also zap from guest page tables before
> checking if conversion is allowed.
>
> Conversion is not meant to happen very regularly, and even if it is
> unmapped or zapped, the next access will fault in the page anyway, so
> there is a performance but not a functionality impact.
Yes, it's OK for shared mapping.

>
> Hope that helps.

It helped, thanks!

> Is it still odd to zap before checking if conversion
> should proceed?
>
>>> [...]
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Ackerley,

On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
>
> The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
> KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
> and private respectively.

I have a high level question about this particular patch and this
approach for conversion: why do we need IOCTLs to manage conversion
between private and shared?

In the presentations I gave at LPC [1, 2], and in my latest patch
series that performs in-place conversion [3] and the associated (by
now outdated) state diagram [4], I didn't see the need to have a
userspace-facing interface to manage that. KVM has all the information
it needs to handle conversions, which are triggered by the guest. To
me this seems like it adds additional complexity, as well as a user
facing interface that we would need to maintain.

There are various ways we could handle conversion without explicit
interference from userspace. What I had in mind is the following (as
an example, details can vary according to VM type). I will use the
case of conversion from shared to private because that is the more
complicated (interesting) case:

- Guest issues a hypercall to request that a shared folio become private.

- The hypervisor receives the call, and passes it to KVM.

- KVM unmaps the folio from the guest stage-2 (EPT I think in x86
parlance), and unmaps it from the host. The host however, could still
have references (e.g., GUP).

- KVM exits to the host (hypervisor call exit), with the information
that the folio has been unshared from it.

- A well-behaving host would now get rid of all of its references
(e.g., release GUPs), perform a VCPU run, and the guest continues
running as normal. I expect this to be the common case.

But to handle the more interesting situation, let's say that the host
doesn't do it immediately, and for some reason it holds on to some
references to that folio.

- Even if that's the case, the guest can still run *. If the guest
tries to access the folio, KVM detects that access when it tries to
fault it into the guest, sees that the host still has references to
that folio, and exits back to the host with a memory fault exit. At
this point, the VCPU that has tried to fault in that particular folio
cannot continue running as long as it cannot fault in that folio.

- The host tries a VCPU run again, and the above repeats, i.e., KVM
checks the refcount, finds that the host still holds references,
doesn't fault the folio into the guest, and exits back to the host.

- Eventually a well-behaving host releases all its references, and the
following VCPU run is able to fault the page into the guest, and
proceed with running it.
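
As a rough sketch of the host side under this flow (pseudo-C; the
exit reasons follow existing UAPI, but release_refs() and the exact
payloads are illustrative only):

	void run_vcpu_loop(int vcpu_fd, struct kvm_run *run)
	{
		for (;;) {
			ioctl(vcpu_fd, KVM_RUN, NULL);

			if (run->exit_reason == KVM_EXIT_HYPERCALL) {
				/* Guest unshared a range: drop host refs. */
				release_refs(run->hypercall.args[0],
					     run->hypercall.args[1]);
			} else if (run->exit_reason == KVM_EXIT_MEMORY_FAULT) {
				/*
				 * KVM would not fault the folio in because
				 * host references remain; drop whatever is
				 * left and simply run again.
				 */
				release_refs(run->memory_fault.gpa,
					     run->memory_fault.size);
			}
		}
	}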

In case the guest is destroyed before that happens, we have the whole
folio_put() callback scenario we had discussed earlier.

In other words, the interface that I had in mind is KVM run exits
(hypercall, memory fault) as well as VCPU run, both of which already
exist and convey the same information. Is there a case where that
isn't enough or is suboptimal?

Thanks,
/fuad

(*) An alternative suggestion was to block the VCPU from running
altogether, regardless of whether it wants to fault the unshared page
immediately, and continually exit to the host until references are
dropped and the conversion can happen.

[1] https://lpc.events/event/17/contributions/1487/
[2] https://lpc.events/event/18/contributions/1758/
[3] https://lore.kernel.org/all/20250328153133.3504118-1-tabba@google.com/
[4] https://lpc.events/event/18/contributions/1758/attachments/1457/3699/Guestmemfd%20folio%20state%20page_type.pdf


> [...]
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 7 months ago
On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
>
> Hi Ackerley,
>
> [...]
>
> - Even if that's the case, the guest can still run *. If the guest
> tries to access the folio, KVM detects that access when it tries to
> fault it into the guest, sees that the host still has references to
> that folio, and exits back to the host with a memory fault exit. At
> this point, the VCPU that has tried to fault in that particular folio
> cannot continue running as long as it cannot fault in that folio.

Are you talking about the following scheme?
1) guest_memfd checks shareability on each get pfn and if there is a
mismatch exit to the host.
2) host user space has to guess whether it's a pending refcount or
whether it's an actual mismatch.
3) guest_memfd will maintain a third state
"pending_private_conversion" or equivalent which will transition to
private upon the last refcount drop of each page.

If conversion is triggered by userspace (in case of pKVM, it will be
triggered from within the KVM (?)):
* Conversion will just fail if there are extra refcounts and userspace
can try to get rid of extra refcounts on the range while it has enough
context without hitting any ambiguity with memory fault exit.
* guest_memfd will not have to deal with this extra state from 3 above
and overall guest_memfd conversion handling becomes relatively
simpler.

Note that for x86 CoCo cases, memory conversion is already triggered
by userspace using KVM ioctl, this series is proposing to use
guest_memfd ioctl to do the same.
 - Allows not having to keep track of separate shared/private range
information in KVM.
 - Simpler handling of the conversion process done per guest_memfd
rather than for full range.
     - Userspace can handle the rollback as needed, simplifying error
handling in guest_memfd.
 - guest_memfd is single source of truth and notifies the users of
shareability change.
     - e.g. IOMMU, userspace, KVM MMU all can be registered for
getting notifications from guest_memfd directly and will get notified
for invalidation upon shareability attribute updates.
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Vishal,

On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
>
> On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
> >
> > Hi Ackerley,
> >
> > [...]
>
> Are you talking about the following scheme?
> 1) guest_memfd checks shareability on each get pfn and if there is a
> mismatch exit to the host.

I think we are not really on the same page here (no pun intended :) ).
I'll try to answer your questions anyway...

Which get_pfn? Are you referring to get_pfn when faulting the page
into the guest or into the host?

> 2) host user space has to guess whether it's a pending refcount or
> whether it's an actual mismatch.

No need to guess. VCPU run will let it know exactly why it's exiting.

> 3) guest_memfd will maintain a third state
> "pending_private_conversion" or equivalent which will transition to
> private upon the last refcount drop of each page.
>
> If conversion is triggered by userspace (in case of pKVM, it will be
> triggered from within the KVM (?)):

Why would conversion be triggered by userspace? As far as I know, it's
the guest that triggers the conversion.

> * Conversion will just fail if there are extra refcounts and userspace
> can try to get rid of extra refcounts on the range while it has enough
> context without hitting any ambiguity with memory fault exit.
> * guest_memfd will not have to deal with this extra state from 3 above
> and overall guest_memfd conversion handling becomes relatively
> simpler.

That's not really related. The extra state isn't necessary any more
once we agreed in the previous discussion that we will retry instead.

> Note that for x86 CoCo cases, memory conversion is already triggered
> by userspace using KVM ioctl, this series is proposing to use
> guest_memfd ioctl to do the same.

The reason why for x86 CoCo cases conversion is already triggered by
userspace using KVM ioctl is that it has to, since shared memory and
private memory are two separate pages, and userspace needs to manage
that. Sharing memory in place removes the need for that.

This series isn't using the same ioctl, it's introducing new ones to
perform a task that as far as I can tell so far, KVM can handle by
itself.

>  - Allows not having to keep track of separate shared/private range
> information in KVM.

This patch series is already tracking shared/private range information in KVM.

>  - Simpler handling of the conversion process done per guest_memfd
> rather than for full range.
>      - Userspace can handle the rollback as needed, simplifying error
> handling in guest_memfd.
>  - guest_memfd is single source of truth and notifies the users of
> shareability change.
>      - e.g. IOMMU, userspace, KVM MMU all can be registered for
> getting notifications from guest_memfd directly and will get notified
> for invalidation upon shareability attribute updates.

All of these can still be done without introducing a new ioctl.

Cheers,
/fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 7 months ago
On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
>
> Hi Vishal,
>
> On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
> >
> > On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
> > >
> > > Hi Ackerley,
> > >
> > > [...]
> >
> > Are you talking about the following scheme?
> > 1) guest_memfd checks shareability on each get pfn and if there is a
> > mismatch exit to the host.
>
> I think we are not really on the same page here (no pun intended :) ).
> I'll try to answer your questions anyway...
>
> Which get_pfn? Are you referring to get_pfn when faulting the page
> into the guest or into the host?

I am referring to guest fault handling in KVM.

>
> > 2) host user space has to guess whether it's a pending refcount or
> > whether it's an actual mismatch.
>
> No need to guess. VCPU run will let it know exactly why it's exiting.
>
> > 3) guest_memfd will maintain a third state
> > "pending_private_conversion" or equivalent which will transition to
> > private upon the last refcount drop of each page.
> >
> > If conversion is triggered by userspace (in case of pKVM, it will be
> > triggered from within the KVM (?)):
>
> Why would conversion be triggered by userspace? As far as I know, it's
> the guest that triggers the conversion.
>
> > * Conversion will just fail if there are extra refcounts and userspace
> > can try to get rid of extra refcounts on the range while it has enough
> > context without hitting any ambiguity with memory fault exit.
> > * guest_memfd will not have to deal with this extra state from 3 above
> > and overall guest_memfd conversion handling becomes relatively
> > simpler.
>
> That's not really related. The extra state isn't necessary any more
> once we agreed in the previous discussion that we will retry instead.

Who is *we* here? Which entity will retry conversion?

>
> > Note that for x86 CoCo cases, memory conversion is already triggered
> > by userspace using KVM ioctl, this series is proposing to use
> > guest_memfd ioctl to do the same.
>
> The reason why for x86 CoCo cases conversion is already triggered by
> userspace using KVM ioctl is that it has to, since shared memory and
> private memory are two separate pages, and userspace needs to manage
> that. Sharing memory in place removes the need for that.

Userspace still needs to clean up memory usage before conversion can
succeed, e.g. removing IOMMU mappings for a shared-to-private
conversion. I would think that memory conversion should not succeed
before all existing users let go of the guest_memfd pages for the
range being converted.

In x86 CoCo use cases, userspace can also decide not to allow
conversion for scenarios where ranges are still under active use by
the host and the guest is erroneously trying to take away memory.
Both the SNP and TDX specs allow conversion to fail due to in-use
memory.

>
> This series isn't using the same ioctl, it's introducing new ones to
> perform a task that as far as I can tell so far, KVM can handle by
> itself.

I would like to understand this better. How will KVM handle the
conversion process for guest_memfd pages? Can you help walk through an
example sequence for a shared-to-private conversion, specifically
around guest_memfd offset states?

>
> >  - Allows not having to keep track of separate shared/private range
> > information in KVM.
>
> This patch series is already tracking shared/private range information in KVM.
>
> >  - Simpler handling of the conversion process done per guest_memfd
> > rather than for full range.
> >      - Userspace can handle the rollback as needed, simplifying error
> > handling in guest_memfd.
> >  - guest_memfd is single source of truth and notifies the users of
> > shareability change.
> >      - e.g. IOMMU, userspace, KVM MMU all can be registered for
> > getting notifications from guest_memfd directly and will get notified
> > for invalidation upon shareability attribute updates.
>
> All of these can still be done without introducing a new ioctl.
>
> Cheers,
> /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Alexey Kardashevskiy 5 months, 3 weeks ago
On 21/5/25 00:11, Vishal Annapurve wrote:
> On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
>>
>> Hi Vishal,
>>
>> On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
>>>
>>> On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
>>>>
>>>> Hi Ackerley,
>>>>
>>>> On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
>>>>>
>>>>> The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
>>>>> KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
>>>>> and private respectively.
>>>>
>>>> I have a high level question about this particular patch and this
>>>> approach for conversion: why do we need IOCTLs to manage conversion
>>>> between private and shared?
>>>>
>>>> In the presentations I gave at LPC [1, 2], and in my latest patch
>>>> series that performs in-place conversion [3] and the associated (by
>>>> now outdated) state diagram [4], I didn't see the need to have a
>>>> userspace-facing interface to manage that. KVM has all the information
>>>> it needs to handle conversions, which are triggered by the guest. To
>>>> me this seems like it adds additional complexity, as well as a user
>>>> facing interface that we would need to maintain.
>>>>
>>>> There are various ways we could handle conversion without explicit
>>>> interference from userspace. What I had in mind is the following (as
>>>> an example, details can vary according to VM type). I will use the
>>>> case of conversion from shared to private because that is the more
>>>> complicated (interesting) case:
>>>>
>>>> - Guest issues a hypercall to request that a shared folio become private.
>>>>
>>>> - The hypervisor receives the call, and passes it to KVM.
>>>>
>>>> - KVM unmaps the folio from the guest stage-2 (EPT I think in x86
>>>> parlance), and unmaps it from the host. The host however, could still
>>>> have references (e.g., GUP).
>>>>
>>>> - KVM exits to the host (hypervisor call exit), with the information
>>>> that the folio has been unshared from it.
>>>>
>>>> - A well behaving host would now get rid of all of its references
>>>> (e.g., release GUPs), perform a VCPU run, and the guest continues
>>>> running as normal. I expect this to be the common case.
>>>>
>>>> But to handle the more interesting situation, let's say that the host
>>>> doesn't do it immediately, and for some reason it holds on to some
>>>> references to that folio.
>>>>
>>>> - Even if that's the case, the guest can still run *. If the guest
>>>> tries to access the folio, KVM detects that access when it tries to
>>>> fault it into the guest, sees that the host still has references to
>>>> that folio, and exits back to the host with a memory fault exit. At
>>>> this point, the VCPU that has tried to fault in that particular folio
>>>> cannot continue running as long as it cannot fault in that folio.
>>>
>>> Are you talking about the following scheme?
>>> 1) guest_memfd checks shareability on each get pfn and if there is a
>>> mismatch exit to the host.
>>
>> I think we are not really on the same page here (no pun intended :) ).
>> I'll try to answer your questions anyway...
>>
>> Which get_pfn? Are you referring to get_pfn when faulting the page
>> into the guest or into the host?
> 
> I am referring to guest fault handling in KVM.
> 
>>
>>> 2) host user space has to guess whether it's a pending refcount or
>>> whether it's an actual mismatch.
>>
>> No need to guess. VCPU run will let it know exactly why it's exiting.
>>
>>> 3) guest_memfd will maintain a third state
>>> "pending_private_conversion" or equivalent which will transition to
>>> private upon the last refcount drop of each page.
>>>
>>> If conversion is triggered by userspace (in case of pKVM, it will be
>>> triggered from within the KVM (?)):
>>
>> Why would conversion be triggered by userspace? As far as I know, it's
>> the guest that triggers the conversion.
>>
>>> * Conversion will just fail if there are extra refcounts and userspace
>>> can try to get rid of extra refcounts on the range while it has enough
>>> context without hitting any ambiguity with memory fault exit.
>>> * guest_memfd will not have to deal with this extra state from 3 above
>>> and overall guest_memfd conversion handling becomes relatively
>>> simpler.
>>
>> That's not really related. The extra state isn't necessary any more
>> once we agreed in the previous discussion that we will retry instead.
> 
> Who is *we* here? Which entity will retry conversion?
> 
>>
>>> Note that for x86 CoCo cases, memory conversion is already triggered
>>> by userspace using KVM ioctl, this series is proposing to use
>>> guest_memfd ioctl to do the same.
>>
>> The reason why for x86 CoCo cases conversion is already triggered by
>> userspace using KVM ioctl is that it has to, since shared memory and
>> private memory are two separate pages, and userspace needs to manage
>> that. Sharing memory in place removes the need for that.
> 
> Userspace still needs to clean up memory usage before conversion is
> successful. e.g. remove IOMMU mappings for shared to private
> conversion. I would think that memory conversion should not succeed
> before all existing users let go of the guest_memfd pages for the
> range being converted.


Ah, about that. Actually, IOMMU mappings can remain the same in a case like my TSM+VFIO RFC based on Fuad's older patches, here in particular:

https://lore.kernel.org/r/20250218111017.491719-13-aik@amd.com

which works nicely - mapped it once and forgot.

Now, I am rebasing my RFC on top of this patchset and it fails in kvm_gmem_has_safe_refcount() as the IOMMU holds references to all these folios in my RFC.

So what is the expected sequence here? Userspace unmaps a DMA page and maps it back right away, all from userspace? The end result will be exactly the same, which seems useless. And the IOMMU TLB is going to be flushed on a page conversion anyway (the RMPUPDATE instruction does that). All this is about AMD's x86 though.

For now (and for fun^wexperiment) I disabled kvm_gmem_has_safe_refcount() (04/51 adds it) and it seems to have no effect until the memfd is closed - then folios_put_refs() crashes in list_del(&folio->lru). I wonder now what direction to go from here.

My TSM+VFIO RFC uses the hw ability to DMA to/from a CoCo VM (== AMD SEV-SNP VM); both private and shared DMA at the same time is going to be allowed. Thanks,



> In x86 CoCo usecases, userspace can also decide to not allow
> conversion for scenarios where ranges are still under active use by
> the host and guest is erroneously trying to take away memory. Both
> SNP/TDX spec allow failure of conversion due to in use memory.
> 
>>
>> This series isn't using the same ioctl, it's introducing new ones to
>> perform a task that as far as I can tell so far, KVM can handle by
>> itself.
> 
> I would like to understand this better. How will KVM handle the
> conversion process for guest_memfd pages? Can you help walk an example
> sequence for shared to private conversion specifically around
> guest_memfd offset states?
> 
>>
>>>   - Allows not having to keep track of separate shared/private range
>>> information in KVM.
>>
>> This patch series is already tracking shared/private range information in KVM.
>>
>>>   - Simpler handling of the conversion process done per guest_memfd
>>> rather than for full range.
>>>       - Userspace can handle the rollback as needed, simplifying error
>>> handling in guest_memfd.
>>>   - guest_memfd is single source of truth and notifies the users of
>>> shareability change.
>>>       - e.g. IOMMU, userspace, KVM MMU all can be registered for
>>> getting notifications from guest_memfd directly and will get notified
>>> for invalidation upon shareability attribute updates.
>>
>> All of these can still be done without introducing a new ioctl.
>>
>> Cheers,
>> /fuad

-- 
Alexey

Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Jason Gunthorpe 5 months, 3 weeks ago
On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:

> Now, I am rebasing my RFC on top of this patchset and it fails in
> kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> folios in my RFC.
> 
> So what is the expected sequence here? The userspace unmaps a DMA
> page and maps it back right away, all from the userspace? The end
> result will be the exactly same which seems useless. And IOMMU TLB
> is going to be flushed on a page conversion anyway (the RMPUPDATE
> instruction does that). All this is about AMD's x86 though.

The iommu should not be using the VMA to manage the mapping. It should
be directly linked to the guestmemfd in some way that does not disturb
its operations. I imagine there would be some kind of invalidation
callback directly to the iommu.

Presumably that invalidation callback can include a reason for the
invalidation (addr change, shared/private conversion, etc.)

I'm not sure how we will figure out which case is which, but guestmemfd
should allow the iommu to plug in either invalidation scheme.

Probably invalidation should be a global-to-the-FD thing; I imagine
that once invalidation is established the iommu will not be
incrementing page refcounts.
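
As a purely illustrative sketch, a reason-carrying invalidation hook
registered globally on the FD could be shaped like this (all names
below are hypothetical; nothing like this exists today):

enum gmem_invalidate_reason {
	GMEM_INVALIDATE_ADDR_CHANGE,	/* e.g. truncate/hole-punch */
	GMEM_INVALIDATE_TO_PRIVATE,	/* shared -> private conversion */
	GMEM_INVALIDATE_TO_SHARED,	/* private -> shared conversion */
};

struct gmem_invalidate_ops {
	/* Called by guest_memfd before mappings for the range go away. */
	void (*invalidate)(void *data, pgoff_t start, pgoff_t end,
			   enum gmem_invalidate_reason reason);
};

/*
 * Registration would be global to the FD; once registered, the iommu
 * would rely on these callbacks instead of taking page refcounts.
 */
int gmem_register_invalidate(struct file *gmem_file,
			     const struct gmem_invalidate_ops *ops,
			     void *data);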

Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 5 months, 3 weeks ago
On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
>
> > Now, I am rebasing my RFC on top of this patchset and it fails in
> > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > folios in my RFC.
> >
> > So what is the expected sequence here? The userspace unmaps a DMA
> > page and maps it back right away, all from the userspace? The end
> > result will be the exactly same which seems useless. And IOMMU TLB

As Jason described, ideally the IOMMU, just like KVM, should just:
1) Directly rely on guest_memfd for pinning -> no page refcounts taken
by the IOMMU stack
2) Directly query pfns from guest_memfd for both shared/private ranges
3) Implement an invalidation callback that guest_memfd can invoke on
conversions.

Current flow:
Private to Shared conversion via kvm_gmem_convert_range() -
    1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
on each bound memslot overlapping with the range
        -> KVM has the concept of invalidation_begin() and end(),
which effectively ensures that between these function calls, no new
EPT/NPT entries can be added for the range.
    2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
actually unmaps the KVM SEPT/NPT entries.
    3) guest_memfd invokes kvm_gmem_execute_work() which updates the
shareability and then splits the folios if needed.

Shared to private conversion via kvm_gmem_convert_range() -
    1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
on each bound memslot overlapping with the range
    2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
actually unmaps the host mappings, which in turn unmaps the KVM
non-secure EPT/NPT entries.
    3) guest_memfd invokes kvm_gmem_execute_work() which updates the
shareability and then merges the folios if needed.
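
To make the ordering concrete, here is a rough sketch in code. The
helper names come from this series, but the signatures and the *_all()
wrappers over the per-memslot invalidate helpers are simplifying
assumptions, not the actual implementation:

/*
 * Rough sketch of the conversion ordering above; locking, error
 * handling and rollback are omitted.
 */
static int kvm_gmem_convert_range_sketch(struct inode *inode, pgoff_t start,
					 pgoff_t end, bool to_shared)
{
	int ret;

	/* 1) Block new EPT/NPT entries for the range on all bound memslots. */
	kvm_gmem_invalidate_begin_all(inode, start, end);

	/*
	 * 2) Unmap. Private -> shared zaps the KVM SEPT/NPT entries;
	 * shared -> private unmaps the host mappings (and with them the
	 * non-secure EPT/NPT entries) and may fail, e.g. on unsafe
	 * refcounts.
	 */
	ret = kvm_gmem_convert_should_proceed(inode, start, end, to_shared);

	/* 3) Update shareability, then split or merge folios as needed. */
	if (!ret)
		ret = kvm_gmem_execute_work(inode, start, end, to_shared);

	kvm_gmem_invalidate_end_all(inode, start, end);
	return ret;
}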

============================

For IOMMU, could something like below work?

* A new UAPI to bind IOMMU FDs with guest_memfd ranges
* VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
guest_memfd ranges using kvm_gmem_get_pfn()
    -> KVM invokes kvm_gmem_is_private() to check the range's
shareability; the IOMMU could use the same, or we could add an API in
gmem that takes in an access type and checks the shareability before
returning the pfn.
* IOMMU stack exposes an invalidation callback that can be invoked by
guest_memfd.

Private to Shared conversion via kvm_gmem_convert_range() -
    1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
on each bound memslot overlapping with the range
    2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
actually unmaps the KVM SEPT/NPT entries.
        -> guest_memfd invokes the IOMMU invalidation callback to zap
the secure IOMMU entries.
    3) guest_memfd invokes kvm_gmem_execute_work() which updates the
shareability and then splits the folios if needed.
    4) Userspace invokes an IOMMU map operation to map the ranges in
the non-secure IOMMU.

Shared to private conversion via kvm_gmem_convert_range() -
    1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
on each bound memslot overlapping with the range
    2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
actually unmaps the host mappings, which in turn unmaps the KVM
non-secure EPT/NPT entries.
        -> guest_memfd invokes the IOMMU invalidation callback to zap
the non-secure IOMMU entries.
    3) guest_memfd invokes kvm_gmem_execute_work() which updates the
shareability and then merges the folios if needed.
    4) Userspace invokes an IOMMU map operation to map the ranges in
the secure IOMMU.

There should be a way to block external IOMMU pagetable updates while
guest_memfd is performing a conversion, e.g. something like
kvm_invalidate_begin()/end().
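
Purely for illustration, the binding UAPI and the update-blocking
bracket could be shaped something like the sketch below (every name
and layout here is hypothetical):

/* Hypothetical UAPI for binding an IOMMU FD to a guest_memfd range. */
struct iommu_gmem_bind {
	__u32 iommufd;		/* IOMMU FD to bind */
	__u32 gmem_fd;		/* guest_memfd backing the range */
	__u64 gmem_offset;	/* start offset within the gmem file */
	__u64 size;		/* bytes to bind */
	__u64 iova;		/* IOVA at which the range is mapped */
};

/*
 * Hypothetical guest_memfd-side bracket to block external IOMMU
 * pagetable updates for the duration of a conversion, analogous to
 * kvm_gmem_invalidate_begin()/end().
 */
void gmem_iommu_update_begin(struct inode *inode, pgoff_t start, pgoff_t end);
void gmem_iommu_update_end(struct inode *inode, pgoff_t start, pgoff_t end);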

> > is going to be flushed on a page conversion anyway (the RMPUPDATE
> > instruction does that). All this is about AMD's x86 though.
>
> The iommu should not be using the VMA to manage the mapping. It should

+1.

> be directly linked to the guestmemfd in some way that does not disturb
> its operations. I imagine there would be some kind of invalidation
> callback directly to the iommu.
>
> Presumably that invalidation call back can include a reason for the
> invalidation (addr change, shared/private conversion, etc)
>
> I'm not sure how we will figure out which case is which but guestmemfd
> should allow the iommu to plug in either invalidation scheme..
>
> Probably invalidation should be a global to the FD thing, I imagine
> that once invalidation is established the iommu will not be
> incrementing page refcounts.

+1.

>
> Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Yan Zhao 5 months, 2 weeks ago
On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> >
> > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > folios in my RFC.
> > >
> > > So what is the expected sequence here? The userspace unmaps a DMA
> > > page and maps it back right away, all from the userspace? The end
> > > result will be the exactly same which seems useless. And IOMMU TLB
> 
>  As Jason described, ideally IOMMU just like KVM, should just:
> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> by IOMMU stack
In TDX Connect, the TDX module and TDs do not trust the VMM, so it is
up to the TDs to inform the TDX module which pages they use for DMA
purposes. So, if a page is regarded as pinned by a TD for DMA, the TDX
module will fail the unmap of that page from the S-EPT.

If the IOMMU side does not increase refcounts, IMHO, some way to
indicate that certain PFNs are used by TDs for DMA is still required,
so guest_memfd can reject the request before attempting the actual
unmap. Otherwise, the unmap of TD-DMA-pinned pages will fail.

Upon this kind of unmapping failure, it also doesn't help for the host
to retry the unmap without the TD first unpinning the pages.


> 2) Directly query pfns from guest_memfd for both shared/private ranges
> 3) Implement an invalidation callback that guest_memfd can invoke on
> conversions.
> 
> Current flow:
> Private to Shared conversion via kvm_gmem_convert_range() -
>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>          -> KVM has the concept of invalidation_begin() and end(),
> which effectively ensures that between these function calls, no new
> EPT/NPT entries can be added for the range.
>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the KVM SEPT/NPT entries.
>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then splits the folios if needed
> 
> Shared to private conversion via kvm_gmem_convert_range() -
>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the host mappings which will unmap the KVM non-seucure
> EPT/NPT entries.
>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then merges the folios if needed.
> 
> ============================
> 
> For IOMMU, could something like below work?
> 
> * A new UAPI to bind IOMMU FDs with guest_memfd ranges
> * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
> guest_memfd ranges using kvm_gmem_get_pfn()
>     -> kvm invokes kvm_gmem_is_private() to check for the range
> shareability, IOMMU could use the same or we could add an API in gmem
> that takes in access type and checks the shareability before returning
> the pfn.
> * IOMMU stack exposes an invalidation callback that can be invoked by
> guest_memfd.
> 
> Private to Shared conversion via kvm_gmem_convert_range() -
>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the KVM SEPT/NPT entries.
>            -> guest_memfd invokes IOMMU invalidation callback to zap
> the secure IOMMU entries.
If guest_memfd could determine whether a page is used for DMA purposes
before attempting the actual unmap, it could reject and fail the
conversion earlier, thereby keeping the IOMMU/S-EPT mappings intact.

This could prevent the conversion from partially failing.

>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then splits the folios if needed
>      4) Userspace invokes IOMMU map operation to map the ranges in
> non-secure IOMMU.
> 
> Shared to private conversion via kvm_gmem_convert_range() -
>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the host mappings which will unmap the KVM non-seucure
> EPT/NPT entries.
>          -> guest_memfd invokes IOMMU invalidation callback to zap the
> non-secure IOMMU entries.
>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then merges the folios if needed.
>      4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.
> 
> There should be a way to block external IOMMU pagetable updates while
> guest_memfd is performing conversion e.g. something like
> kvm_invalidate_begin()/end().
> 
> > > is going to be flushed on a page conversion anyway (the RMPUPDATE
> > > instruction does that). All this is about AMD's x86 though.
> >
> > The iommu should not be using the VMA to manage the mapping. It should
> 
> +1.
> 
> > be directly linked to the guestmemfd in some way that does not disturb
> > its operations. I imagine there would be some kind of invalidation
> > callback directly to the iommu.
> >
> > Presumably that invalidation call back can include a reason for the
> > invalidation (addr change, shared/private conversion, etc)
> >
> > I'm not sure how we will figure out which case is which but guestmemfd
> > should allow the iommu to plug in either invalidation scheme..
> >
> > Probably invalidation should be a global to the FD thing, I imagine
> > that once invalidation is established the iommu will not be
> > incrementing page refcounts.
> 
> +1.
> 
> >
> > Jason
> 
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ackerley Tng 5 months ago
Yan Zhao <yan.y.zhao@intel.com> writes:

> On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
>> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>> >
>> > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
>> >
>> > > Now, I am rebasing my RFC on top of this patchset and it fails in
>> > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
>> > > folios in my RFC.
>> > >
>> > > So what is the expected sequence here? The userspace unmaps a DMA
>> > > page and maps it back right away, all from the userspace? The end
>> > > result will be the exactly same which seems useless. And IOMMU TLB
>> 
>>  As Jason described, ideally IOMMU just like KVM, should just:
>> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
>> by IOMMU stack
> In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
> TDX module about which pages are used by it for DMAs purposes.
> So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> unmap of the pages from S-EPT.
>
> If IOMMU side does not increase refcount, IMHO, some way to indicate that
> certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> reject the request before attempting the actual unmap.
> Otherwise, the unmap of TD-DMA-pinned pages will fail.
>
> Upon this kind of unmapping failure, it also doesn't help for host to retry
> unmapping without unpinning from TD.
>
>

Yan, Yilun, would it work if, on conversion,

1. guest_memfd notifies IOMMU that a conversion is about to happen for a
   PFN range
2. IOMMU forwards the notification to TDX code in the kernel
3. TDX code in kernel tells TDX module to stop thinking of any PFNs in
   the range as pinned for DMA?

If the above is possible then by the time we get to unmapping from
S-EPTs, TDX module would already consider the PFNs in the range "not
pinned for DMA".

>> 2) Directly query pfns from guest_memfd for both shared/private ranges
>> 3) Implement an invalidation callback that guest_memfd can invoke on
>> conversions.
>> 
>> Current flow:
>> Private to Shared conversion via kvm_gmem_convert_range() -
>>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>> on each bound memslot overlapping with the range
>>          -> KVM has the concept of invalidation_begin() and end(),
>> which effectively ensures that between these function calls, no new
>> EPT/NPT entries can be added for the range.
>>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>> actually unmaps the KVM SEPT/NPT entries.
>>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>> shareability and then splits the folios if needed
>> 
>> Shared to private conversion via kvm_gmem_convert_range() -
>>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>> on each bound memslot overlapping with the range
>>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>> actually unmaps the host mappings which will unmap the KVM non-seucure
>> EPT/NPT entries.
>>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>> shareability and then merges the folios if needed.
>> 
>> ============================
>> 
>> For IOMMU, could something like below work?
>> 
>> * A new UAPI to bind IOMMU FDs with guest_memfd ranges
>> * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
>> guest_memfd ranges using kvm_gmem_get_pfn()
>>     -> kvm invokes kvm_gmem_is_private() to check for the range
>> shareability, IOMMU could use the same or we could add an API in gmem
>> that takes in access type and checks the shareability before returning
>> the pfn.
>> * IOMMU stack exposes an invalidation callback that can be invoked by
>> guest_memfd.
>> 
>> Private to Shared conversion via kvm_gmem_convert_range() -
>>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>> on each bound memslot overlapping with the range
>>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>> actually unmaps the KVM SEPT/NPT entries.
>>            -> guest_memfd invokes IOMMU invalidation callback to zap
>> the secure IOMMU entries.
> If guest_memfd could determine if a page is used by DMA purposes before
> attempting the actual unmaps, it could reject and fail the conversion earlier,
> thereby keeping IOMMU/S-EPT mappings intact.
>
> This could prevent the conversion from partially failing.
>

If the above suggestion works, then instead of checking if pages are
allowed to be unmapped, guest_memfd will just force everyone to unmap.

>>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>> shareability and then splits the folios if needed
>>      4) Userspace invokes IOMMU map operation to map the ranges in
>> non-secure IOMMU.
>> 
>> Shared to private conversion via kvm_gmem_convert_range() -
>>     1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>> on each bound memslot overlapping with the range
>>      2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>> actually unmaps the host mappings which will unmap the KVM non-seucure
>> EPT/NPT entries.
>>          -> guest_memfd invokes IOMMU invalidation callback to zap the
>> non-secure IOMMU entries.
>>      3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>> shareability and then merges the folios if needed.
>>      4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.
>> 
>> There should be a way to block external IOMMU pagetable updates while
>> guest_memfd is performing conversion e.g. something like
>> kvm_invalidate_begin()/end().
>> 
>> > > is going to be flushed on a page conversion anyway (the RMPUPDATE
>> > > instruction does that). All this is about AMD's x86 though.
>> >
>> > The iommu should not be using the VMA to manage the mapping. It should
>> 
>> +1.
>> 
>> > be directly linked to the guestmemfd in some way that does not disturb
>> > its operations. I imagine there would be some kind of invalidation
>> > callback directly to the iommu.
>> >
>> > Presumably that invalidation call back can include a reason for the
>> > invalidation (addr change, shared/private conversion, etc)
>> >
>> > I'm not sure how we will figure out which case is which but guestmemfd
>> > should allow the iommu to plug in either invalidation scheme..
>> >
>> > Probably invalidation should be a global to the FD thing, I imagine
>> > that once invalidation is established the iommu will not be
>> > incrementing page refcounts.
>> 
>> +1.
>> 
>> >
>> > Jason
>> 
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 5 months ago
On Wed, Jul 16, 2025 at 03:22:06PM -0700, Ackerley Tng wrote:
> Yan Zhao <yan.y.zhao@intel.com> writes:
> 
> > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> >> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >> >
> >> > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> >> >
> >> > > Now, I am rebasing my RFC on top of this patchset and it fails in
> >> > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> >> > > folios in my RFC.
> >> > >
> >> > > So what is the expected sequence here? The userspace unmaps a DMA
> >> > > page and maps it back right away, all from the userspace? The end
> >> > > result will be the exactly same which seems useless. And IOMMU TLB
> >> 
> >>  As Jason described, ideally IOMMU just like KVM, should just:
> >> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> >> by IOMMU stack
> > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
> > TDX module about which pages are used by it for DMAs purposes.
> > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > unmap of the pages from S-EPT.
> >
> > If IOMMU side does not increase refcount, IMHO, some way to indicate that
> > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> > reject the request before attempting the actual unmap.
> > Otherwise, the unmap of TD-DMA-pinned pages will fail.
> >
> > Upon this kind of unmapping failure, it also doesn't help for host to retry
> > unmapping without unpinning from TD.
> >
> >
> 
> Yan, Yilun, would it work if, on conversion,
> 
> 1. guest_memfd notifies IOMMU that a conversion is about to happen for a
>    PFN range

It is a guest fw call that releases the pinning. By the time the VMM
gets the conversion request, the page is already physically unpinned.
So I agree with Jason that the pinning doesn't have to reach the IOMMU
from a SW POV.

> 2. IOMMU forwards the notification to TDX code in the kernel
> 3. TDX code in kernel tells TDX module to stop thinking of any PFNs in
>    the range as pinned for DMA?

The TDX host can't stop the pinning. Actually, this mechanism is there
to prevent the host from unpinning/unmapping the DMA pages against the
guest's expectation.

Thanks,
Yilun

> 
> If the above is possible then by the time we get to unmapping from
> S-EPTs, TDX module would already consider the PFNs in the range "not
> pinned for DMA".
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ackerley Tng 5 months ago
Xu Yilun <yilun.xu@linux.intel.com> writes:

> On Wed, Jul 16, 2025 at 03:22:06PM -0700, Ackerley Tng wrote:
>> Yan Zhao <yan.y.zhao@intel.com> writes:
>> 
>> > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
>> >> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>> >> >
>> >> > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
>> >> >
>> >> > > Now, I am rebasing my RFC on top of this patchset and it fails in
>> >> > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
>> >> > > folios in my RFC.
>> >> > >
>> >> > > So what is the expected sequence here? The userspace unmaps a DMA
>> >> > > page and maps it back right away, all from the userspace? The end
>> >> > > result will be the exactly same which seems useless. And IOMMU TLB
>> >> 
>> >>  As Jason described, ideally IOMMU just like KVM, should just:
>> >> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
>> >> by IOMMU stack
>> > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
>> > TDX module about which pages are used by it for DMAs purposes.
>> > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
>> > unmap of the pages from S-EPT.
>> >
>> > If IOMMU side does not increase refcount, IMHO, some way to indicate that
>> > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
>> > reject the request before attempting the actual unmap.
>> > Otherwise, the unmap of TD-DMA-pinned pages will fail.
>> >
>> > Upon this kind of unmapping failure, it also doesn't help for host to retry
>> > unmapping without unpinning from TD.
>> >
>> >
>> 
>> Yan, Yilun, would it work if, on conversion,
>> 
>> 1. guest_memfd notifies IOMMU that a conversion is about to happen for a
>>    PFN range
>
> It is the Guest fw call to release the pinning.

I see, thanks for explaining.

> By the time VMM get the
> conversion requirement, the page is already physically unpinned. So I
> agree with Jason the pinning doesn't have to reach to iommu from SW POV.
>

If by the time KVM gets the conversion request, the page is unpinned,
then we're all good, right?

When guest_memfd gets the conversion request, as part of conversion
handling it will request to zap the page from stage-2 page tables. TDX
module would see that the page is unpinned and the unmapping will
proceed fine. Is that understanding correct?

>> 2. IOMMU forwards the notification to TDX code in the kernel
>> 3. TDX code in kernel tells TDX module to stop thinking of any PFNs in
>>    the range as pinned for DMA?
>
> TDX host can't stop the pinning. Actually this mechanism is to prevent
> host from unpin/unmap the DMA out of Guest expectation.
>

On this note, I'd also like to check something else. Putting TDX connect
and IOMMUs aside, if the host unmaps a guest private page today without
the guest requesting it, the unmapping will work and the guest will be
broken, right?

> Thanks,
> Yilun
>
>> 
>> If the above is possible then by the time we get to unmapping from
>> S-EPTs, TDX module would already consider the PFNs in the range "not
>> pinned for DMA".
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 5 months ago
On Thu, Jul 17, 2025 at 09:56:01AM -0700, Ackerley Tng wrote:
> Xu Yilun <yilun.xu@linux.intel.com> writes:
> 
> > On Wed, Jul 16, 2025 at 03:22:06PM -0700, Ackerley Tng wrote:
> >> Yan Zhao <yan.y.zhao@intel.com> writes:
> >> 
> >> > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> >> >> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >> >> >
> >> >> > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> >> >> >
> >> >> > > Now, I am rebasing my RFC on top of this patchset and it fails in
> >> >> > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> >> >> > > folios in my RFC.
> >> >> > >
> >> >> > > So what is the expected sequence here? The userspace unmaps a DMA
> >> >> > > page and maps it back right away, all from the userspace? The end
> >> >> > > result will be the exactly same which seems useless. And IOMMU TLB
> >> >> 
> >> >>  As Jason described, ideally IOMMU just like KVM, should just:
> >> >> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> >> >> by IOMMU stack
> >> > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
> >> > TDX module about which pages are used by it for DMAs purposes.
> >> > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> >> > unmap of the pages from S-EPT.
> >> >
> >> > If IOMMU side does not increase refcount, IMHO, some way to indicate that
> >> > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> >> > reject the request before attempting the actual unmap.
> >> > Otherwise, the unmap of TD-DMA-pinned pages will fail.
> >> >
> >> > Upon this kind of unmapping failure, it also doesn't help for host to retry
> >> > unmapping without unpinning from TD.
> >> >
> >> >
> >> 
> >> Yan, Yilun, would it work if, on conversion,
> >> 
> >> 1. guest_memfd notifies IOMMU that a conversion is about to happen for a
> >>    PFN range
> >
> > It is the Guest fw call to release the pinning.
> 
> I see, thanks for explaining.
> 
> > By the time VMM get the
> > conversion requirement, the page is already physically unpinned. So I
> > agree with Jason the pinning doesn't have to reach to iommu from SW POV.
> >
> 
> If by the time KVM gets the conversion request, the page is unpinned,
> then we're all good, right?

Yes, unless the guest doesn't unpin the page first, by mistake. The
guest would invoke a fw call, tdg.mem.page.release, to unpin the page
before KVM_HC_MAP_GPA_RANGE.

> 
> When guest_memfd gets the conversion request, as part of conversion
> handling it will request to zap the page from stage-2 page tables. TDX
> module would see that the page is unpinned and the unmapping will
> proceed fine. Is that understanding correct?

Yes, again unless the guest doesn't unpin.

> 
> >> 2. IOMMU forwards the notification to TDX code in the kernel
> >> 3. TDX code in kernel tells TDX module to stop thinking of any PFNs in
> >>    the range as pinned for DMA?
> >
> > TDX host can't stop the pinning. Actually this mechanism is to prevent
> > host from unpin/unmap the DMA out of Guest expectation.
> >
> 
> On this note, I'd also like to check something else. Putting TDX connect
> and IOMMUs aside, if the host unmaps a guest private page today without
> the guest requesting it, the unmapping will work and the guest will be
> broken, right?

Correct. The unmapping will work, but the guest can't continue anymore.

Thanks,
Yilun
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ira Weiny 5 months ago
Xu Yilun wrote:
> On Thu, Jul 17, 2025 at 09:56:01AM -0700, Ackerley Tng wrote:
> > Xu Yilun <yilun.xu@linux.intel.com> writes:
> > 
> > > On Wed, Jul 16, 2025 at 03:22:06PM -0700, Ackerley Tng wrote:
> > >> Yan Zhao <yan.y.zhao@intel.com> writes:
> > >> 
> > >> > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > >> >> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >> >> >
> > >> >> > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > >> >> >
> > >> >> > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > >> >> > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > >> >> > > folios in my RFC.
> > >> >> > >
> > >> >> > > So what is the expected sequence here? The userspace unmaps a DMA
> > >> >> > > page and maps it back right away, all from the userspace? The end
> > >> >> > > result will be the exactly same which seems useless. And IOMMU TLB
> > >> >> 
> > >> >>  As Jason described, ideally IOMMU just like KVM, should just:
> > >> >> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > >> >> by IOMMU stack
> > >> > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
> > >> > TDX module about which pages are used by it for DMAs purposes.
> > >> > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > >> > unmap of the pages from S-EPT.
> > >> >
> > >> > If IOMMU side does not increase refcount, IMHO, some way to indicate that
> > >> > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> > >> > reject the request before attempting the actual unmap.
> > >> > Otherwise, the unmap of TD-DMA-pinned pages will fail.
> > >> >
> > >> > Upon this kind of unmapping failure, it also doesn't help for host to retry
> > >> > unmapping without unpinning from TD.
> > >> >
> > >> >
> > >> 
> > >> Yan, Yilun, would it work if, on conversion,
> > >> 
> > >> 1. guest_memfd notifies IOMMU that a conversion is about to happen for a
> > >>    PFN range
> > >
> > > It is the Guest fw call to release the pinning.
> > 
> > I see, thanks for explaining.
> > 
> > > By the time VMM get the
> > > conversion requirement, the page is already physically unpinned. So I
> > > agree with Jason the pinning doesn't have to reach to iommu from SW POV.
> > >
> > 
> > If by the time KVM gets the conversion request, the page is unpinned,
> > then we're all good, right?
> 
> Yes, unless guest doesn't unpin the page first by mistake.

Or maliciously?  :-(

My initial response to this was that this is a bug and we don't need to be
concerned with it.  However, can't this be a DoS where one TD crashes the
system, if the host uses the private page for something else and the
machine #MC's?

Ira

> Guest would
> invoke a fw call tdg.mem.page.release to unpin the page before
> KVM_HC_MAP_GPA_RANGE.
> 
> > 
> > When guest_memfd gets the conversion request, as part of conversion
> > handling it will request to zap the page from stage-2 page tables. TDX
> > module would see that the page is unpinned and the unmapping will
> > proceed fine. Is that understanding correct?
> 
> Yes, again unless guess doesn't unpin.
> 
> > 
> > >> 2. IOMMU forwards the notification to TDX code in the kernel
> > >> 3. TDX code in kernel tells TDX module to stop thinking of any PFNs in
> > >>    the range as pinned for DMA?
> > >
> > > TDX host can't stop the pinning. Actually this mechanism is to prevent
> > > host from unpin/unmap the DMA out of Guest expectation.
> > >
> > 
> > On this note, I'd also like to check something else. Putting TDX connect
> > and IOMMUs aside, if the host unmaps a guest private page today without
> > the guest requesting it, the unmapping will work and the guest will be
> > broken, right?
> 
> Correct. The unmapping will work, the guest can't continue anymore.
> 
> Thanks,
> Yilun


Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 4 months, 4 weeks ago
> > > >> Yan, Yilun, would it work if, on conversion,
> > > >> 
> > > >> 1. guest_memfd notifies IOMMU that a conversion is about to happen for a
> > > >>    PFN range
> > > >
> > > > It is the Guest fw call to release the pinning.
> > > 
> > > I see, thanks for explaining.
> > > 
> > > > By the time VMM get the
> > > > conversion requirement, the page is already physically unpinned. So I
> > > > agree with Jason the pinning doesn't have to reach to iommu from SW POV.
> > > >
> > > 
> > > If by the time KVM gets the conversion request, the page is unpinned,
> > > then we're all good, right?
> > 
> > Yes, unless guest doesn't unpin the page first by mistake.
> 
> Or maliciously?  :-(

Yes.

> 
> My initial response to this was that this is a bug and we don't need to be
> concerned with it.  However, can't this be a DOS from one TD to crash the
> system if the host uses the private page for something else and the
> machine #MC's?

I think we are already doing something to prevent vcpus from executing
and then destroy the VM, so there is no further TD access. But I assume
there is a concern that a TD could just leak a lot of resources, and we
are investigating whether the host can reclaim them.

Thanks,
Yilun
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ackerley Tng 4 months, 4 weeks ago
Xu Yilun <yilun.xu@linux.intel.com> writes:

>> > > >> Yan, Yilun, would it work if, on conversion,
>> > > >> 
>> > > >> 1. guest_memfd notifies IOMMU that a conversion is about to happen for a
>> > > >>    PFN range
>> > > >
>> > > > It is the Guest fw call to release the pinning.
>> > > 
>> > > I see, thanks for explaining.
>> > > 
>> > > > By the time VMM get the
>> > > > conversion requirement, the page is already physically unpinned. So I
>> > > > agree with Jason the pinning doesn't have to reach to iommu from SW POV.
>> > > >
>> > > 
>> > > If by the time KVM gets the conversion request, the page is unpinned,
>> > > then we're all good, right?
>> > 
>> > Yes, unless guest doesn't unpin the page first by mistake.
>> 
>> Or maliciously?  :-(
>
> Yes.
>
>> 
>> My initial response to this was that this is a bug and we don't need to be
>> concerned with it.  However, can't this be a DOS from one TD to crash the
>> system if the host uses the private page for something else and the
>> machine #MC's?
>
> I think we are already doing something to prevent vcpus from executing
> then destroy VM, so no further TD accessing. But I assume there is
> concern a TD could just leak a lot of resources, and we are
> investigating if host can reclaim them.
>
> Thanks,
> Yilun

Sounds like a malicious guest could skip unpinning private memory, and
guest_memfd's unmap will fail, leading to a KVM_BUG_ON() as Yan/Rick
suggested here [1].

Actually it seems like a legacy guest would also lead to unmap failures
and the KVM_BUG_ON(), since when TDX connect is enabled, the pinning
mode is enforced, even for non-IO private pages?

I hope your team's investigations find a good way for the host to
reclaim memory, at least from dead TDs! Otherwise this would be an open
hole for guests to leak a host's memory.

Circling back to the original topic [2], it sounds like we're okay for
IOMMU to *not* take any refcounts on pages and can rely on guest_memfd
to keep the page around on behalf of the VM?

[1] https://lore.kernel.org/all/diqzcya13x2j.fsf@ackerleytng-ctop.c.googlers.com/
[2] https://lore.kernel.org/all/CAGtprH_qh8sEY3s-JucW3n1Wvoq7jdVZDDokvG5HzPf0HV2=pg@mail.gmail.com/
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Edgecombe, Rick P 4 months, 4 weeks ago
On Tue, 2025-07-22 at 11:17 -0700, Ackerley Tng wrote:
> Sounds like a malicious guest could skip unpinning private memory, and
> guest_memfd's unmap will fail, leading to a KVM_BUG_ON() as Yan/Rick
> suggested here [1].
> 
> Actually it seems like a legacy guest would also lead to unmap failures
> and the KVM_BUG_ON(), since when TDX connect is enabled, the pinning
> mode is enforced, even for non-IO private pages?
> 
> I hope your team's investigations find a good way for the host to
> reclaim memory, at least from dead TDs! Otherwise this would be an open
> hole for guests to leak a host's memory.
> 
> Circling back to the original topic [2], it sounds like we're okay for
> IOMMU to *not* take any refcounts on pages and can rely on guest_memfd
> to keep the page around on behalf of the VM?
> 
> [1] https://lore.kernel.org/all/diqzcya13x2j.fsf@ackerleytng-ctop.c.googlers.com/
> [2] https://lore.kernel.org/all/CAGtprH_qh8sEY3s-JucW3n1Wvoq7jdVZDDokvG5HzPf0HV2=pg@mail.gmail.com/

Djbw, Yilun and I had a chat yesterday. We'll investigate a way to have an
operation that can't fail and will allow total cleanup and reclaim for the TD's
resources, as well as a per-TDX module scoped version. 

If host userspace or the guest kernel does something wrong, the guest can be
destroyed in the normal VM case. So we can try to use these operations as a way
to save host kernel complexity for cases like that. But if an error condition
might come up in normal cases (i.e. rare races, non-bugs) we need to look to
other error handling solutions.

We were planning to investigate first and then share back to the list. It
probably deserves broader consideration beyond folks still reading deep down in
this thread.
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Jason Gunthorpe 5 months ago
On Fri, Jul 18, 2025 at 10:48:55AM +0800, Xu Yilun wrote:
> > If by the time KVM gets the conversion request, the page is unpinned,
> > then we're all good, right?
> 
> Yes, unless guest doesn't unpin the page first by mistake. Guest would
> invoke a fw call tdg.mem.page.release to unpin the page before
> KVM_HC_MAP_GPA_RANGE.

What does guest pinning mean?

Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 4 months, 4 weeks ago
On Fri, Jul 18, 2025 at 11:15:59AM -0300, Jason Gunthorpe wrote:
> On Fri, Jul 18, 2025 at 10:48:55AM +0800, Xu Yilun wrote:
> > > If by the time KVM gets the conversion request, the page is unpinned,
> > > then we're all good, right?
> > 
> > Yes, unless guest doesn't unpin the page first by mistake. Guest would
> > invoke a fw call tdg.mem.page.release to unpin the page before
> > KVM_HC_MAP_GPA_RANGE.
> 
> What does guest pinning mean?

TDX firmware provides a mode in which the host can't block the S-EPT
mapping after the TD accepts the mapping. The guest 'pins' the private
mapping (KVM & IOMMU).

The TD should explicitly unaccept the page via tdg.mem.page.release;
then the host can successfully block/unmap the S-EPT. This is necessary
for shared <-> private conversion.

When TDX Connect is enabled, this mode is enforced.
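
In other words, the guest-side ordering for a private -> shared
conversion would be something like the sketch below; the wrapper names
are placeholders for the real TDCALL/hypercall plumbing, not actual
kernel functions:

/*
 * Guest-side ordering sketch when the pinning mode is in effect.
 * tdg_mem_page_release() and kvm_hypercall_map_gpa_range() are
 * stand-in names.
 */
static int td_convert_private_to_shared(u64 gpa, u64 nr_pages)
{
	int ret;

	/* 1) Unaccept/unpin first, so the host's S-EPT unmap can succeed. */
	ret = tdg_mem_page_release(gpa, nr_pages);
	if (ret)
		return ret;

	/* 2) Only then ask the host to convert the GPA range to shared. */
	return kvm_hypercall_map_gpa_range(gpa, nr_pages, MAP_GPA_SHARED);
}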

Thanks,
Yilun

> 
> Jason
>
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 5 months, 2 weeks ago
On Wed, Jul 2, 2025 at 1:38 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
>
> On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > >
> > > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > > folios in my RFC.
> > > >
> > > > So what is the expected sequence here? The userspace unmaps a DMA
> > > > page and maps it back right away, all from the userspace? The end
> > > > result will be the exactly same which seems useless. And IOMMU TLB
> >
> >  As Jason described, ideally IOMMU just like KVM, should just:
> > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > by IOMMU stack
> In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
> TDX module about which pages are used by it for DMAs purposes.
> So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> unmap of the pages from S-EPT.
>
> If IOMMU side does not increase refcount, IMHO, some way to indicate that
> certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> reject the request before attempting the actual unmap.

So it looks like guest_memfd will need an interface with KVM/IOMMU
backends to check if unmapping can succeed. And if unmapping still
fails, there should be a way for KVM/IOMMU backends to kill the TD and
any TDIs bound to that TD.
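
A minimal sketch of what the shape of such an interface could be (the
names are hypothetical, just to show the idea):

/*
 * Hypothetical per-backend hooks guest_memfd could call around an
 * unmap: a pre-check so conversion can be rejected before any entries
 * are zapped, and a last-resort path if the unmap fails anyway.
 */
struct gmem_unmap_ops {
	/* Can this range currently be unmapped (e.g. not TD-DMA-pinned)? */
	bool (*can_unmap)(void *data, pgoff_t start, pgoff_t end);
	/* Unmap failed anyway: kill the TD and any TDIs bound to it. */
	void (*unmap_failed)(void *data, pgoff_t start, pgoff_t end);
};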

> Otherwise, the unmap of TD-DMA-pinned pages will fail.
>
> Upon this kind of unmapping failure, it also doesn't help for host to retry
> unmapping without unpinning from TD.
>
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Jason Gunthorpe 5 months, 2 weeks ago
On Wed, Jul 02, 2025 at 06:54:10AM -0700, Vishal Annapurve wrote:
> On Wed, Jul 2, 2025 at 1:38 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> >
> > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > >
> > > > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > > >
> > > > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > > > folios in my RFC.
> > > > >
> > > > > So what is the expected sequence here? The userspace unmaps a DMA
> > > > > page and maps it back right away, all from the userspace? The end
> > > > > result will be the exactly same which seems useless. And IOMMU TLB
> > >
> > >  As Jason described, ideally IOMMU just like KVM, should just:
> > > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > > by IOMMU stack
> > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs to inform
> > TDX module about which pages are used by it for DMAs purposes.
> > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > unmap of the pages from S-EPT.

I don't see this as having much to do with iommufd.

iommufd will somehow support the T=1 iommu inside the TDX module but
it won't have an IOAS for it since the VMM does not control the
translation.

The discussion here is for the T=0 iommu which is controlled by
iommufd and does have an IOAS. It should be populated with all the
shared pages from the guestmemfd.

> > If IOMMU side does not increase refcount, IMHO, some way to indicate that
> > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> > reject the request before attempting the actual unmap.

This has to be dealt with between the TDX module and KVM. When KVM
gives pages to become secure it may not be able to get them back.

This problem has nothing to do with iommufd.

But generally I expect that the T=1 iommu follows the S-EPT entirely
and there is no notion of pages "locked for dma". If DMA is ongoing
and a page is made non-secure then the DMA fails.

Obviously in a mode where there is a vPCI device we will need all the
pages to be pinned in the guestmemfd to prevent any kind of
migrations. Only shared/private conversions should change the page
around.

Maybe this needs to be an integral functionality in guestmemfd?

Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 5 months, 2 weeks ago
On Wed, Jul 2, 2025 at 7:13 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Wed, Jul 02, 2025 at 06:54:10AM -0700, Vishal Annapurve wrote:
> > On Wed, Jul 2, 2025 at 1:38 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > >
> > > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > > > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > > >
> > > > > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > > > >
> > > > > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > > > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > > > > folios in my RFC.
> > > > > >
> > > > > > So what is the expected sequence here? The userspace unmaps a DMA
> > > > > > page and maps it back right away, all from the userspace? The end
> > > > > result will be exactly the same, which seems useless. And IOMMU TLB
> > > >
> > > >  As Jason described, ideally IOMMU just like KVM, should just:
> > > > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > > > by IOMMU stack
> > > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs that inform
> > > the TDX module about which pages are used by them for DMA purposes.
> > > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > > unmap of the pages from S-EPT.
>
> I don't see this as having much to do with iommufd.
>
> iommufd will somehow support the T=1 iommu inside the TDX module but
> it won't have an IOAS for it since the VMM does not control the
> translation.
>
> The discussion here is for the T=0 iommu which is controlled by
> iommufd and does have an IOAS. It should be populated with all the
> shared pages from the guestmemfd.
>
> > > If IOMMU side does not increase refcount, IMHO, some way to indicate that
> > > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> > > reject the request before attempting the actual unmap.
>
> This has to be dealt with between the TDX module and KVM. When KVM
> gives pages to become secure it may not be able to get them back..
>
> This problem has nothing to do with iommufd.
>
> But generally I expect that the T=1 iommu follows the S-EPT entirely
> and there is no notion of pages "locked for dma". If DMA is ongoing
> and a page is made non-secure then the DMA fails.
>
> Obviously in a mode where there is a vPCI device we will need all the
> pages to be pinned in the guestmemfd to prevent any kind of
> migrations. Only shared/private conversions should change the page
> around.

Yes, guest_memfd ensures that all the faulted-in pages (irrespective
of shared or private ranges) are not migratable. We already have a
similar restriction with CPU accesses to encrypted memory ranges that
need arch specific protocols to migrate memory contents.

>
> Maybe this needs to be an integral functionality in guestmemfd?
>
> Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 5 months, 1 week ago
On Wed, Jul 02, 2025 at 07:32:36AM -0700, Vishal Annapurve wrote:
> On Wed, Jul 2, 2025 at 7:13 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Wed, Jul 02, 2025 at 06:54:10AM -0700, Vishal Annapurve wrote:
> > > On Wed, Jul 2, 2025 at 1:38 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > >
> > > > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > > > > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > > > >
> > > > > > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > > > > >
> > > > > > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > > > > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > > > > > folios in my RFC.
> > > > > > >
> > > > > > > So what is the expected sequence here? The userspace unmaps a DMA
> > > > > > > page and maps it back right away, all from the userspace? The end
> > > > > > > result will be exactly the same, which seems useless. And IOMMU TLB
> > > > >
> > > > >  As Jason described, ideally IOMMU just like KVM, should just:
> > > > > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > > > > by IOMMU stack
> > > > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs that inform
> > > > the TDX module about which pages are used by them for DMA purposes.
> > > > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > > > unmap of the pages from S-EPT.
> >
> > I don't see this as having much to do with iommufd.
> >
> > iommufd will somehow support the T=1 iommu inside the TDX module but
> > it won't have an IOAS for it since the VMM does not control the
> > translation.

I partially agree with this.

This is still the DMA silent-drop issue for security. The HW (also
applicable to AMD/ARM) screams out if the trusted DMA path (the IOMMU
mapping, or an access control table like the RMP) is changed outside the
TD's expectation. So from the HW POV, it is an iommu problem.

For SW, if we don't blame the iommu, maybe we rephrase it as: gmemfd can't
invalidate private pages unless the TD agrees.

> >
> > The discussion here is for the T=0 iommu which is controlled by
> > iommufd and does have an IOAS. It should be populated with all the
> > shared pages from the guestmemfd.
> >
> > > > If IOMMU side does not increase refcount, IMHO, some way to indicate that
> > > > certain PFNs are used by TDs for DMA is still required, so guest_memfd can
> > > > reject the request before attempting the actual unmap.
> >
> > This has to be dealt with between the TDX module and KVM. When KVM
> > gives pages to become secure it may not be able to get them back..

Just to be clear. With in-place conversion, it is not KVM that gives pages
to become secure, it is gmemfd. Or maybe you mean gmemfd is part of KVM.

https://lore.kernel.org/all/aC86OsU2HSFZkJP6@google.com/

> >
> > This problem has nothing to do with iommufd.
> >
> > But generally I expect that the T=1 iommu follows the S-EPT entirely
> > and there is no notion of pages "locked for dma". If DMA is ongoing
> > and a page is made non-secure then the DMA fails.
> >
> > Obviously in a mode where there is a vPCI device we will need all the
> > pages to be pinned in the guestmemfd to prevent any kind of
> > migrations. Only shared/private conversions should change the page
> > around.

Only *guest permitted* conversions should change the page, i.e. only when
the VMM is dealing with the KVM_HC_MAP_GPA_RANGE hypercall. Not sure if we
could just let QEMU ensure this or whether KVM/guestmemfd should ensure it.

Thanks,
Yilun

> 
> Yes, guest_memfd ensures that all the faulted-in pages (irrespective
> of shared or private ranges) are not migratable. We already have a
> similar restriction with CPU accesses to encrypted memory ranges that
> need arch specific protocols to migrate memory contents.
> 
> >
> > Maybe this needs to be an integral functionality in guestmemfd?
> >
> > Jason
> 
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Jason Gunthorpe 5 months, 1 week ago
On Thu, Jul 10, 2025 at 06:50:09PM +0800, Xu Yilun wrote:
> On Wed, Jul 02, 2025 at 07:32:36AM -0700, Vishal Annapurve wrote:
> > On Wed, Jul 2, 2025 at 7:13 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Wed, Jul 02, 2025 at 06:54:10AM -0700, Vishal Annapurve wrote:
> > > > On Wed, Jul 2, 2025 at 1:38 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > > >
> > > > > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > > > > > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > > > > >
> > > > > > > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > > > > > >
> > > > > > > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > > > > > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > > > > > > folios in my RFC.
> > > > > > > >
> > > > > > > > So what is the expected sequence here? The userspace unmaps a DMA
> > > > > > > > page and maps it back right away, all from the userspace? The end
> > > > > > > > result will be exactly the same, which seems useless. And IOMMU TLB
> > > > > >
> > > > > >  As Jason described, ideally IOMMU just like KVM, should just:
> > > > > > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > > > > > by IOMMU stack
> > > > > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs that inform
> > > > > the TDX module about which pages are used by them for DMA purposes.
> > > > > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > > > > unmap of the pages from S-EPT.
> > >
> > > I don't see this as having much to do with iommufd.
> > >
> > > iommufd will somehow support the T=1 iommu inside the TDX module but
> > > it won't have an IOAS for it since the VMM does not control the
> > > translation.
> 
> I partially agree with this.
> 
> This is still the DMA silent-drop issue for security. The HW (also
> applicable to AMD/ARM) screams out if the trusted DMA path (the IOMMU
> mapping, or an access control table like the RMP) is changed outside the
> TD's expectation. So from the HW POV, it is an iommu problem.

I thought the basic idea was that the secure world would sanity check
what the insecure world is doing and if it is not OK then it blows up. So if
the DMA fails because the untrusted world revoked shareability when it
should not have, then this is correct and expected?

> For SW, if we don't blame the iommu, maybe we rephrase it as: gmemfd can't
> invalidate private pages unless the TD agrees.

I think you mean guestmemfd in the kernel cannot autonomously change
'something' unless explicitly instructed to by userspace.

The expectation is that userspace will only give such instructions
based on the VM telling it to do a shared/private change.

If userspace gives an instruction that was not agreed with the guest
then the secure world can police the error and blow up.
 
> Just to be clear. With in-place conversion, it is not KVM that gives pages
> to become secure, it is gmemfd. Or maybe you mean gmemfd is part of KVM.

Yeah, I mean part of.

> > > Obviously in a mode where there is a vPCI device we will need all the
> > > pages to be pinned in the guestmemfd to prevent any kind of
> > > migrations. Only shared/private conversions should change the page
> > > around.
> 
> Only *guest permitted* conversions should change the page, i.e. only when
> the VMM is dealing with the KVM_HC_MAP_GPA_RANGE hypercall. Not sure if we
> could just let QEMU ensure this or whether KVM/guestmemfd should ensure it.

I think it should not be part of the kernel, no need. From a kernel
perspective userspace has requested a shared/private conversion and if
it wasn't agreed with the VM then it will explode.

Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 5 months, 1 week ago
On Thu, Jul 10, 2025 at 02:54:49PM -0300, Jason Gunthorpe wrote:
> On Thu, Jul 10, 2025 at 06:50:09PM +0800, Xu Yilun wrote:
> > On Wed, Jul 02, 2025 at 07:32:36AM -0700, Vishal Annapurve wrote:
> > > On Wed, Jul 2, 2025 at 7:13 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > >
> > > > On Wed, Jul 02, 2025 at 06:54:10AM -0700, Vishal Annapurve wrote:
> > > > > On Wed, Jul 2, 2025 at 1:38 AM Yan Zhao <yan.y.zhao@intel.com> wrote:
> > > > > >
> > > > > > On Tue, Jun 24, 2025 at 07:10:38AM -0700, Vishal Annapurve wrote:
> > > > > > > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > > > > > > >
> > > > > > > > On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> > > > > > > >
> > > > > > > > > Now, I am rebasing my RFC on top of this patchset and it fails in
> > > > > > > > > kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> > > > > > > > > folios in my RFC.
> > > > > > > > >
> > > > > > > > > So what is the expected sequence here? The userspace unmaps a DMA
> > > > > > > > > page and maps it back right away, all from the userspace? The end
> > > > > > > > > result will be exactly the same, which seems useless. And IOMMU TLB
> > > > > > >
> > > > > > >  As Jason described, ideally IOMMU just like KVM, should just:
> > > > > > > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > > > > > > by IOMMU stack
> > > > > > In TDX connect, TDX module and TDs do not trust VMM. So, it's the TDs that inform
> > > > > > the TDX module about which pages are used by them for DMA purposes.
> > > > > > So, if a page is regarded as pinned by TDs for DMA, the TDX module will fail the
> > > > > > unmap of the pages from S-EPT.
> > > >
> > > > I don't see this as having much to do with iommufd.
> > > >
> > > > iommufd will somehow support the T=1 iommu inside the TDX module but
> > > > it won't have an IOAS for it since the VMM does not control the
> > > > translation.
> > 
> > I partially agree with this.
> > 
> > This is still the DMA silent-drop issue for security. The HW (also
> > applicable to AMD/ARM) screams out if the trusted DMA path (the IOMMU
> > mapping, or an access control table like the RMP) is changed outside the
> > TD's expectation. So from the HW POV, it is an iommu problem.
> 
> I thought the basic idea was that the secure world would sanity check
> what the insecure world is doing and if it is not OK then it blows up. So if

Yes. The secure world checks. But it leaves unexpected changes on the
CPU path alone, because the CPU is synchronous and the VM just pends on
the fault, so there is no security concern. DMA is asynchronous, and the
secure world must blow up.

> the DMA fails because the untrusted world revoked shareability when it
> should not have, then this is correct and expected?

OK. From the secure world's POV the failure is correct & expected.

> 
> > For SW, if we don't blame the iommu, maybe we rephrase it as: gmemfd can't
> > invalidate private pages unless the TD agrees.
> 
> I think you mean guestmemfd in the kernel cannot autonomously change
> 'something' unless explicitly instructed to by userspace.
> 
> The expectation is that userspace will only give such instructions
> based on the VM telling it to do a shared/private change.
> 
> If userspace gives an instruction that was not agreed with the guest
> then the secure world can police the error and blow up.

Yes.

>  
> > Just to be clear. With in-place conversion, it is not KVM that gives pages
> > to become secure, it is gmemfd. Or maybe you mean gmemfd is part of KVM.
> 
> Yeah, I mean part of.
> 
> > > > Obviously in a mode where there is a vPCI device we will need all the
> > > > pages to be pinned in the guestmemfd to prevent any kind of
> > > > migrations. Only shared/private conversions should change the page
> > > > around.
> > 
> > Only *guest permitted* conversions should change the page, i.e. only when
> > the VMM is dealing with the KVM_HC_MAP_GPA_RANGE hypercall. Not sure if we
> > could just let QEMU ensure this or whether KVM/guestmemfd should ensure it.
> 
> I think it should not be part of the kernel, no need. From a kernel
> perspective userspace has requested a shared/private conversion and if
> it wasn't agreed with the VM then it will explode.

I'm OK with it now. It's simple if we don't try to recover from the
explosion. Although I see the after-explosion processing in the kernel is
complex and I'm not sure how it will advance.

Thanks,
Yilun

> 
> Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Xu Yilun 5 months, 1 week ago
> > > 
> > > Only *guest permitted* conversions should change the page, i.e. only when
> > > the VMM is dealing with the KVM_HC_MAP_GPA_RANGE hypercall. Not sure if we
> > > could just let QEMU ensure this or whether KVM/guestmemfd should ensure it.
> > 
> > I think it should not be part of the kernel, no need. From a kernel
> > perspective userspace has requested a shared/private conversion and if
> > it wasn't agreed with the VM then it will explode.
> 
> I'm OK with it now. It's simple if we don't try to recover from the
> explosion. Although I see the after-explosion processing in the kernel is
> complex and I'm not sure how it will advance.

I see the discussion in another thread about a similar issue: a TDX
Module bug makes the S-EPT unmap impossible and KVM just does
KVM_BUG_ON(). But this conversion issue is a little different; usually
it's not decent to panic because of a userspace request. So we may need
further error handling, or a KVM/gmemfd kAPI to disallow/allow
conversion and prevent more complex errors.
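
The kAPI being suggested could be as small as the following sketch (both names invented here for illustration):

/*
 * Hypothetical sketch: KVM marks a range as non-convertible while the
 * TD has it pinned for DMA, so the conversion ioctl can fail fast with
 * -EBUSY instead of running into an unrecoverable S-EPT unmap failure.
 */
int kvm_gmem_disallow_conversion(struct inode *inode, pgoff_t start,
                                 pgoff_t end);
void kvm_gmem_allow_conversion(struct inode *inode, pgoff_t start,
                               pgoff_t end);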

Thanks,
Yilun

> 
> Thanks,
> Yilun
> 
> > 
> > Jason
>
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Alexey Kardashevskiy 5 months, 3 weeks ago

On 25/6/25 00:10, Vishal Annapurve wrote:
> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>
>> On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
>>
>>> Now, I am rebasing my RFC on top of this patchset and it fails in
>>> kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
>>> folios in my RFC.
>>>
>>> So what is the expected sequence here? The userspace unmaps a DMA
>>> page and maps it back right away, all from the userspace? The end
>>> result will be exactly the same, which seems useless. And IOMMU TLB
> 
>   As Jason described, ideally IOMMU just like KVM, should just:
> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> by IOMMU stack
> 2) Directly query pfns from guest_memfd for both shared/private ranges
> 3) Implement an invalidation callback that guest_memfd can invoke on
> conversions.
> 
> Current flow:
> Private to Shared conversion via kvm_gmem_convert_range() -
>      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>           -> KVM has the concept of invalidation_begin() and end(),
> which effectively ensures that between these function calls, no new
> EPT/NPT entries can be added for the range.
>       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the KVM SEPT/NPT entries.
>       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then splits the folios if needed
> 
> Shared to private conversion via kvm_gmem_convert_range() -
>      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the host mappings which will unmap the KVM non-secure
> EPT/NPT entries.
>       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then merges the folios if needed.
> 
> ============================
> 
> For IOMMU, could something like below work?
> 
> * A new UAPI to bind IOMMU FDs with guest_memfd ranges

Done that.

> * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
> guest_memfd ranges using kvm_gmem_get_pfn()

This API imho should drop the confusing kvm_ prefix.

>      -> kvm invokes kvm_gmem_is_private() to check for the range
> shareability, IOMMU could use the same or we could add an API in gmem
> that takes in access type and checks the shareability before returning
> the pfn.

Right now I cut-and-pasted kvm_gmem_get_folio() (which essentially is filemap_lock_folio()/filemap_alloc_folio()/__filemap_add_folio()) to avoid new links between iommufd.ko and kvm.ko. It is probably unavoidable though.


> * IOMMU stack exposes an invalidation callback that can be invoked by
> guest_memfd.
> 
> Private to Shared conversion via kvm_gmem_convert_range() -
>      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the KVM SEPT/NPT entries.
>             -> guest_memfd invokes IOMMU invalidation callback to zap
> the secure IOMMU entries.
>       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then splits the folios if needed
>       4) Userspace invokes IOMMU map operation to map the ranges in
> non-secure IOMMU.
> 
> Shared to private conversion via kvm_gmem_convert_range() -
>      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> on each bound memslot overlapping with the range
>       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> actually unmaps the host mappings which will unmap the KVM non-secure
> EPT/NPT entries.
>           -> guest_memfd invokes IOMMU invalidation callback to zap the
> non-secure IOMMU entries.
>       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> shareability and then merges the folios if needed.
>       4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.


Alright (although this zap+map is not necessary on the AMD hw).


> There should be a way to block external IOMMU pagetable updates while
> guest_memfd is performing conversion e.g. something like
> kvm_invalidate_begin()/end().
> 
>>> is going to be flushed on a page conversion anyway (the RMPUPDATE
>>> instruction does that). All this is about AMD's x86 though.
>>
>> The iommu should not be using the VMA to manage the mapping. It should
> 
> +1.

Yeah, already not doing this, because I physically cannot map gmemfd's memory in the IOMMU via a VMA (that path allocates memory via gup(), so the wrong memory gets mapped in the IOMMU). Thanks,


>> be directly linked to the guestmemfd in some way that does not disturb
>> its operations. I imagine there would be some kind of invalidation
>> callback directly to the iommu.
>>
>> Presumably that invalidation call back can include a reason for the
>> invalidation (addr change, shared/private conversion, etc)
>>
>> I'm not sure how we will figure out which case is which but guestmemfd
>> should allow the iommu to plug in either invalidation scheme..
>>
>> Probably invalidation should be a global to the FD thing, I imagine
>> that once invalidation is established the iommu will not be
>> incrementing page refcounts.
> 
> +1.

Alright. Thanks for the comments.

> 
>>
>> Jason

-- 
Alexey

Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 5 months, 3 weeks ago
On Thu, Jun 26, 2025 at 9:50 PM Alexey Kardashevskiy <aik@amd.com> wrote:
>
>
>
> On 25/6/25 00:10, Vishal Annapurve wrote:
> > On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >>
> >> On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
> >>
> >>> Now, I am rebasing my RFC on top of this patchset and it fails in
> >>> kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
> >>> folios in my RFC.
> >>>
> >>> So what is the expected sequence here? The userspace unmaps a DMA
> >>> page and maps it back right away, all from the userspace? The end
> >>> result will be exactly the same, which seems useless. And IOMMU TLB
> >
> >   As Jason described, ideally IOMMU just like KVM, should just:
> > 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
> > by IOMMU stack
> > 2) Directly query pfns from guest_memfd for both shared/private ranges
> > 3) Implement an invalidation callback that guest_memfd can invoke on
> > conversions.

Conversions and truncations both.

> >
> > Current flow:
> > Private to Shared conversion via kvm_gmem_convert_range() -
> >      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> > on each bound memslot overlapping with the range
> >           -> KVM has the concept of invalidation_begin() and end(),
> > which effectively ensures that between these function calls, no new
> > EPT/NPT entries can be added for the range.
> >       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> > actually unmaps the KVM SEPT/NPT entries.
> >       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> > shareability and then splits the folios if needed
> >
> > Shared to private conversion via kvm_gmem_convert_range() -
> >      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> > on each bound memslot overlapping with the range
> >       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> > actually unmaps the host mappings which will unmap the KVM non-secure
> > EPT/NPT entries.
> >       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> > shareability and then merges the folios if needed.
> >
> > ============================
> >
> > For IOMMU, could something like below work?
> >
> > * A new UAPI to bind IOMMU FDs with guest_memfd ranges
>
> Done that.
>
> > * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
> > guest_memfd ranges using kvm_gmem_get_pfn()
>
> This API imho should drop the confusing kvm_ prefix.
>
> >      -> kvm invokes kvm_gmem_is_private() to check for the range
> > shareability, IOMMU could use the same or we could add an API in gmem
> > that takes in access type and checks the shareability before returning
> > the pfn.
>
> Right now I cut-and-pasted kvm_gmem_get_folio() (which essentially is filemap_lock_folio()/filemap_alloc_folio()/__filemap_add_folio()) to avoid new links between iommufd.ko and kvm.ko. It is probably unavoidable though.

I don't think that's the way to avoid links between iommufd.ko and
kvm.ko. Cleaner way probably is to have gmem logic built-in and allow
runtime registration of invalidation callbacks from KVM/IOMMU
backends. Need to think about this more.

>
>
> > * IOMMU stack exposes an invalidation callback that can be invoked by
> > guest_memfd.
> >
> > Private to Shared conversion via kvm_gmem_convert_range() -
> >      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> > on each bound memslot overlapping with the range
> >       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> > actually unmaps the KVM SEPT/NPT entries.
> >             -> guest_memfd invokes IOMMU invalidation callback to zap
> > the secure IOMMU entries.
> >       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> > shareability and then splits the folios if needed
> >       4) Userspace invokes IOMMU map operation to map the ranges in
> > non-secure IOMMU.
> >
> > Shared to private conversion via kvm_gmem_convert_range() -
> >      1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> > on each bound memslot overlapping with the range
> >       2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> > actually unmaps the host mappings which will unmap the KVM non-secure
> > EPT/NPT entries.
> >           -> guest_memfd invokes IOMMU invalidation callback to zap the
> > non-secure IOMMU entries.
> >       3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> > shareability and then merges the folios if needed.
> >       4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.
>
>
> Alright (although this zap+map is not necessary on the AMD hw).

IMO guest_memfd ideally should not directly interact or cater to arch
specific needs, it should implement a mechanism that works for all
archs. KVM/IOMMU implement invalidation callbacks and have all the
architecture specific knowledge to take the right decisions.

>
>
> > There should be a way to block external IOMMU pagetable updates while
> > guest_memfd is performing conversion e.g. something like
> > kvm_invalidate_begin()/end().
> >
> >>> is going to be flushed on a page conversion anyway (the RMPUPDATE
> >>> instruction does that). All this is about AMD's x86 though.
> >>
> >> The iommu should not be using the VMA to manage the mapping. It should
> >
> > +1.
>
> Yeah, already not doing this, because I physically cannot map gmemfd's memory in the IOMMU via a VMA (that path allocates memory via gup(), so the wrong memory gets mapped in the IOMMU). Thanks,
>
>
> >> be directly linked to the guestmemfd in some way that does not disturb
> >> its operations. I imagine there would be some kind of invalidation
> >> callback directly to the iommu.
> >>
> >> Presumably that invalidation call back can include a reason for the
> >> invalidation (addr change, shared/private conversion, etc)
> >>
> >> I'm not sure how we will figure out which case is which but guestmemfd
> >> should allow the iommu to plug in either invalidation scheme..
> >>
> >> Probably invalidation should be a global to the FD thing, I imagine
> >> that once invalidation is established the iommu will not be
> >> incrementing page refcounts.
> >
> > +1.
>
> Alright. Thanks for the comments.
>
> >
> >>
> >> Jason
>
> --
> Alexey
>
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Alexey Kardashevskiy 5 months, 3 weeks ago

On 28/6/25 01:17, Vishal Annapurve wrote:
> On Thu, Jun 26, 2025 at 9:50 PM Alexey Kardashevskiy <aik@amd.com> wrote:
>>
>>
>>
>> On 25/6/25 00:10, Vishal Annapurve wrote:
>>> On Tue, Jun 24, 2025 at 6:08 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>>>>
>>>> On Tue, Jun 24, 2025 at 06:23:54PM +1000, Alexey Kardashevskiy wrote:
>>>>
>>>>> Now, I am rebasing my RFC on top of this patchset and it fails in
>>>>> kvm_gmem_has_safe_refcount() as IOMMU holds references to all these
>>>>> folios in my RFC.
>>>>>
>>>>> So what is the expected sequence here? The userspace unmaps a DMA
>>>>> page and maps it back right away, all from the userspace? The end
>>>>> result will be exactly the same, which seems useless. And IOMMU TLB
>>>
>>>    As Jason described, ideally IOMMU just like KVM, should just:
>>> 1) Directly rely on guest_memfd for pinning -> no page refcounts taken
>>> by IOMMU stack
>>> 2) Directly query pfns from guest_memfd for both shared/private ranges
>>> 3) Implement an invalidation callback that guest_memfd can invoke on
>>> conversions.
> 
> Conversions and truncations both.
> 
>>>
>>> Current flow:
>>> Private to Shared conversion via kvm_gmem_convert_range() -
>>>       1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>>> on each bound memslot overlapping with the range
>>>            -> KVM has the concept of invalidation_begin() and end(),
>>> which effectively ensures that between these function calls, no new
>>> EPT/NPT entries can be added for the range.
>>>        2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>>> actually unmaps the KVM SEPT/NPT entries.
>>>        3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>>> shareability and then splits the folios if needed
>>>
>>> Shared to private conversion via kvm_gmem_convert_range() -
>>>       1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>>> on each bound memslot overlapping with the range
>>>        2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>>> actually unmaps the host mappings which will unmap the KVM non-secure
>>> EPT/NPT entries.
>>>        3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>>> shareability and then merges the folios if needed.
>>>
>>> ============================
>>>
>>> For IOMMU, could something like below work?
>>>
>>> * A new UAPI to bind IOMMU FDs with guest_memfd ranges
>>
>> Done that.
>>
>>> * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
>>> guest_memfd ranges using kvm_gmem_get_pfn()
>>
>> This API imho should drop the confusing kvm_ prefix.
>>
>>>       -> kvm invokes kvm_gmem_is_private() to check for the range
>>> shareability, IOMMU could use the same or we could add an API in gmem
>>> that takes in access type and checks the shareability before returning
>>> the pfn.
>>
>> Right now I cut-and-pasted kvm_gmem_get_folio() (which essentially is filemap_lock_folio()/filemap_alloc_folio()/__filemap_add_folio()) to avoid new links between iommufd.ko and kvm.ko. It is probably unavoidable though.
> 
> I don't think that's the way to avoid links between iommufd.ko and
> kvm.ko. Cleaner way probably is to have gmem logic built-in and allow
> runtime registration of invalidation callbacks from KVM/IOMMU
> backends. Need to think about this more.

Yeah, otherwise iommufd.ko will have to install a hook in guest_memfd (==kvm.ko) at run time, so more of the beloved symbol_get() :)

> 
>>
>>
>>> * IOMMU stack exposes an invalidation callback that can be invoked by
>>> guest_memfd.
>>>
>>> Private to Shared conversion via kvm_gmem_convert_range() -
>>>       1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>>> on each bound memslot overlapping with the range
>>>        2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>>> actually unmaps the KVM SEPT/NPT entries.
>>>              -> guest_memfd invokes IOMMU invalidation callback to zap
>>> the secure IOMMU entries.
>>>        3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>>> shareability and then splits the folios if needed
>>>        4) Userspace invokes IOMMU map operation to map the ranges in
>>> non-secure IOMMU.
>>>
>>> Shared to private conversion via kvm_gmem_convert_range() -
>>>       1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>>> on each bound memslot overlapping with the range
>>>        2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>>> actually unmaps the host mappings which will unmap the KVM non-secure
>>> EPT/NPT entries.
>>>            -> guest_memfd invokes IOMMU invalidation callback to zap the
>>> non-secure IOMMU entries.
>>>        3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>>> shareability and then merges the folios if needed.
>>>        4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.
>>
>>
>> Alright (although this zap+map is not necessary on the AMD hw).
> 
> IMO guest_memfd ideally should not directly interact or cater to arch
> specific needs, it should implement a mechanism that works for all
> archs. KVM/IOMMU implement invalidation callbacks and have all the
> architecture specific knowledge to take the right decisions.


Every page conversion will go through:

kvm-amd.ko -1-> guest_memfd (kvm.ko) -2-> iommufd.ko -3-> amd-iommu (built-in).

Which one decides on IOMMU not needing (un)mapping? Got to be (1), but then it needs to propagate the decision to amd-iommu (and we do not have (3) at the moment in that path).

Or do we just always do unmap+map (and trigger unwanted huge page smashing)? All is doable and neither is particularly horrible; I'm trying to see where the consensus is now. Thanks,


>>
>>> There should be a way to block external IOMMU pagetable updates while
>>> guest_memfd is performing conversion e.g. something like
>>> kvm_invalidate_begin()/end().
>>>
>>>>> is going to be flushed on a page conversion anyway (the RMPUPDATE
>>>>> instruction does that). All this is about AMD's x86 though.
>>>>
>>>> The iommu should not be using the VMA to manage the mapping. It should
>>>
>>> +1.
>>
>> Yeah, already not doing this, because I physically cannot map gmemfd's memory in the IOMMU via a VMA (that path allocates memory via gup(), so the wrong memory gets mapped in the IOMMU). Thanks,
>>
>>
>>>> be directly linked to the guestmemfd in some way that does not disturb
>>>> its operations. I imagine there would be some kind of invalidation
>>>> callback directly to the iommu.
>>>>
>>>> Presumably that invalidation call back can include a reason for the
>>>> invalidation (addr change, shared/private conversion, etc)
>>>>
>>>> I'm not sure how we will figure out which case is which but guestmemfd
>>>> should allow the iommu to plug in either invalidation scheme..
>>>>
>>>> Probably invalidation should be a global to the FD thing, I imagine
>>>> that once invalidation is established the iommu will not be
>>>> incrementing page refcounts.
>>>
>>> +1.
>>
>> Alright. Thanks for the comments.
>>
>>>
>>>>
>>>> Jason
>>
>> --
>> Alexey
>>

-- 
Alexey

Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 5 months, 2 weeks ago
On Sun, Jun 29, 2025 at 5:19 PM Alexey Kardashevskiy <aik@amd.com> wrote:
> ...
> >>> ============================
> >>>
> >>> For IOMMU, could something like below work?
> >>>
> >>> * A new UAPI to bind IOMMU FDs with guest_memfd ranges
> >>
> >> Done that.
> >>
> >>> * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
> >>> guest_memfd ranges using kvm_gmem_get_pfn()
> >>
> >> This API imho should drop the confusing kvm_ prefix.
> >>
> >>>       -> kvm invokes kvm_gmem_is_private() to check for the range
> >>> shareability, IOMMU could use the same or we could add an API in gmem
> >>> that takes in access type and checks the shareability before returning
> >>> the pfn.
> >>
> >> Right now I cut-and-pasted kvm_gmem_get_folio() (which essentially is filemap_lock_folio()/filemap_alloc_folio()/__filemap_add_folio()) to avoid new links between iommufd.ko and kvm.ko. It is probably unavoidable though.
> >
> > I don't think that's the way to avoid links between iommufd.ko and
> > kvm.ko. Cleaner way probably is to have gmem logic built-in and allow
> > runtime registration of invalidation callbacks from KVM/IOMMU
> > backends. Need to think about this more.
>
> Yeah, otherwise iommufd.ko will have to install a hook in guest_memfd (==kvm.ko) at run time, so more of the beloved symbol_get() :)
>
> >
> >>
> >>
> >>> * IOMMU stack exposes an invalidation callback that can be invoked by
> >>> guest_memfd.
> >>>
> >>> Private to Shared conversion via kvm_gmem_convert_range() -
> >>>       1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> >>> on each bound memslot overlapping with the range
> >>>        2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> >>> actually unmaps the KVM SEPT/NPT entries.
> >>>              -> guest_memfd invokes IOMMU invalidation callback to zap
> >>> the secure IOMMU entries.
> >>>        3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> >>> shareability and then splits the folios if needed
> >>>        4) Userspace invokes IOMMU map operation to map the ranges in
> >>> non-secure IOMMU.
> >>>
> >>> Shared to private conversion via kvm_gmem_convert_range() -
> >>>       1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
> >>> on each bound memslot overlapping with the range
> >>>        2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
> >>> actually unmaps the host mappings which will unmap the KVM non-secure
> >>> EPT/NPT entries.
> >>>            -> guest_memfd invokes IOMMU invalidation callback to zap the
> >>> non-secure IOMMU entries.
> >>>        3) guest_memfd invokes kvm_gmem_execute_work() which updates the
> >>> shareability and then merges the folios if needed.
> >>>        4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.
> >>
> >>
> >> Alright (although this zap+map is not necessary on the AMD hw).
> >
> > IMO guest_memfd ideally should not directly interact or cater to arch
> > specific needs, it should implement a mechanism that works for all
> > archs. KVM/IOMMU implement invalidation callbacks and have all the
> > architecture specific knowledge to take the right decisions.
>
>
> Every page conversion will go through:
>
> kvm-amd.ko -1-> guest_memfd (kvm.ko) -2-> iommufd.ko -3-> amd-iommu (built-in).
>
> Which one decides on IOMMU not needing (un)mapping? Got to be (1), but then it needs to propagate the decision to amd-iommu (and we do not have (3) at the moment in that path).

If there is a need, guest_memfd can support two different callbacks:
1) Conversion notifier/callback invoked by guest_memfd during
conversion handling.
2) Invalidation notifier/callback invoked by guest_memfd during truncation.

Iommufd/kvm can handle the conversion callback/notifier as per the needs
of the underlying architecture, e.g. do the unmapping for TDX connect vs.
skip the unmapping for SEV Trusted IO.

Invalidation callback/notifier will need to be handled by unmapping page tables.
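
As a rough sketch of that two-callback shape (all names invented here; the actual registration mechanism is still open):

/*
 * Hypothetical sketch: a backend (KVM, iommufd) registers both
 * callbacks.  The conversion callback lets it apply per-arch policy
 * (unmap for TDX connect, skip for SEV Trusted IO); the invalidation
 * callback always unmaps, e.g. on truncation.
 */
struct gmem_notifier_ops {
        void (*convert)(void *priv, pgoff_t start, pgoff_t end,
                        bool to_private);
        void (*invalidate)(void *priv, pgoff_t start, pgoff_t end);
};

int gmem_register_notifier(struct inode *inode,
                           const struct gmem_notifier_ops *ops, void *priv);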

>
> Or do we just always do unmap+map (and trigger unwanted huge page smashing)? All is doable and neither is particularly horrible; I'm trying to see where the consensus is now. Thanks,
>

I assume when you say huge page smashing, it means huge page NPT
mapping getting split.

AFAIR, based on discussion with Michael during guest_memfd calls,
stage2 NPT entries need to be of the same granularity as RMP tables
for AMD SNP guests. i.e. huge page NPT mappings need to be smashed on
the KVM side during conversion. So today guest_memfd sends
invalidation notification to KVM for both conversion and truncation.
Doesn't the same constraint for keeping IOMMU page tables at the same
granularity as RMP tables hold for trusted IO?
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Alexey Kardashevskiy 5 months, 1 week ago

On 1/7/25 00:19, Vishal Annapurve wrote:
> On Sun, Jun 29, 2025 at 5:19 PM Alexey Kardashevskiy <aik@amd.com> wrote:
>> ...
>>>>> ============================
>>>>>
>>>>> For IOMMU, could something like below work?
>>>>>
>>>>> * A new UAPI to bind IOMMU FDs with guest_memfd ranges
>>>>
>>>> Done that.
>>>>
>>>>> * VFIO_DMA_MAP/UNMAP operations modified to directly fetch pfns from
>>>>> guest_memfd ranges using kvm_gmem_get_pfn()
>>>>
>>>> This API imho should drop the confusing kvm_ prefix.
>>>>
>>>>>        -> kvm invokes kvm_gmem_is_private() to check for the range
>>>>> shareability, IOMMU could use the same or we could add an API in gmem
>>>>> that takes in access type and checks the shareability before returning
>>>>> the pfn.
>>>>
>>>> Right now I cut-and-pasted kvm_gmem_get_folio() (which essentially is filemap_lock_folio()/filemap_alloc_folio()/__filemap_add_folio()) to avoid new links between iommufd.ko and kvm.ko. It is probably unavoidable though.
>>>
>>> I don't think that's the way to avoid links between iommufd.ko and
>>> kvm.ko. Cleaner way probably is to have gmem logic built-in and allow
>>> runtime registration of invalidation callbacks from KVM/IOMMU
>>> backends. Need to think about this more.
>>
>> Yeah, otherwise iommufd.ko will have to install a hook in guest_memfd (==kvm.ko) at run time, so more of the beloved symbol_get() :)
>>
>>>
>>>>
>>>>
>>>>> * IOMMU stack exposes an invalidation callback that can be invoked by
>>>>> guest_memfd.
>>>>>
>>>>> Private to Shared conversion via kvm_gmem_convert_range() -
>>>>>        1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>>>>> on each bound memslot overlapping with the range
>>>>>         2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>>>>> actually unmaps the KVM SEPT/NPT entries.
>>>>>               -> guest_memfd invokes IOMMU invalidation callback to zap
>>>>> the secure IOMMU entries.
>>>>>         3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>>>>> shareability and then splits the folios if needed
>>>>>         4) Userspace invokes IOMMU map operation to map the ranges in
>>>>> non-secure IOMMU.
>>>>>
>>>>> Shared to private conversion via kvm_gmem_convert_range() -
>>>>>        1) guest_memfd invokes kvm_gmem_invalidate_begin() for the ranges
>>>>> on each bound memslot overlapping with the range
>>>>>         2) guest_memfd invokes kvm_gmem_convert_should_proceed() which
>>>>> actually unmaps the host mappings which will unmap the KVM non-secure
>>>>> EPT/NPT entries.
>>>>>             -> guest_memfd invokes IOMMU invalidation callback to zap the
>>>>> non-secure IOMMU entries.
>>>>>         3) guest_memfd invokes kvm_gmem_execute_work() which updates the
>>>>> shareability and then merges the folios if needed.
>>>>>         4) Userspace invokes IOMMU map operation to map the ranges in secure IOMMU.
>>>>
>>>>
>>>> Alright (although this zap+map is not necessary on the AMD hw).
>>>
>>> IMO guest_memfd ideally should not directly interact or cater to arch
>>> specific needs, it should implement a mechanism that works for all
>>> archs. KVM/IOMMU implement invalidation callbacks and have all the
>>> architecture specific knowledge to take the right decisions.
>>
>>
>> Every page conversion will go through:
>>
>> kvm-amd.ko -1-> guest_memfd (kvm.ko) -2-> iommufd.ko -3-> amd-iommu (built-in).
>>
>> Which one decides on IOMMU not needing (un)mapping? Got to be (1), but then it needs to propagate the decision to amd-iommu (and we do not have (3) at the moment in that path).
> 
> If there is a need, guest_memfd can support two different callbacks:
> 1) Conversion notifier/callback invoked by guest_memfd during
> conversion handling.
> 2) Invalidation notifier/callback invoked by guest_memfd during truncation.
> 
> Iommufd/kvm can handle conversion callback/notifier as per the needs
> of underlying architecture. e.g. for TDX connect do the unmapping vs
> for SEV Trusted IO skip the unmapping.
> 
> Invalidation callback/notifier will need to be handled by unmapping page tables.
> 
>>
>> Or do we just always do unmap+map (and trigger unwanted huge page smashing)? All is doable and neither is particularly horrible; I'm trying to see where the consensus is now. Thanks,
>>
> 
> I assume when you say huge page smashing, it means huge page NPT
> mapping getting split.
> 
> AFAIR, based on discussion with Michael during guest_memfd calls,
> stage2 NPT entries need to be of the same granularity as RMP tables
> for AMD SNP guests. i.e. huge page NPT mappings need to be smashed on
> the KVM side during conversion. So today guest_memfd sends
> invalidation notification to KVM for both conversion and truncation.
> Doesn't the same constraint for keeping IOMMU page tables at the same
> granularity as RMP tables hold for trusted IO?


Currently I handle this from KVM with a hack to get the IOPDE from the AMD IOMMU, so both the 2MB RMP entry and the IOPDE entries are smashed in one go by one of the many firmwares running on EPYC, and atm this is too hacky to be posted even as an RFC. This likely needs to move to IOMMUFD then (via some callbacks), which could call the AMD IOMMU driver, which then would call that firmware (called "TMPM"; it is not the PSP, which is the "TSM"), probably. Thanks,



-- 
Alexey

Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Jason Gunthorpe 5 months, 1 week ago
On Thu, Jul 10, 2025 at 04:57:25PM +1000, Alexey Kardashevskiy wrote:

> Currently I handle this from KVM with a hack to get the IOPDE from
> the AMD IOMMU, so both the 2MB RMP entry and the IOPDE entries are
> smashed in one go by one of the many firmwares running on EPYC, and
> atm this is too hacky to be posted even as an RFC. This likely needs
> to move to IOMMUFD then (via some callbacks), which could call the
> AMD IOMMU driver, which then would call that firmware (called "TMPM";
> it is not the PSP, which is the "TSM"), probably. Thanks,

Wasn't the issue with the iommu that it needed to have a PTE break
whenever the shared/private changed in the RMP? Because the HW can't
handle an IOPTE that crosses more than one RMP entry? Or do I
misunderstand the problem?

If this is the problem I was expecting the page table code that
translates the guest memfd into the iommu PTEs would respect the
shared/private conversion boundaries and break up the PTEs
automatically.
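
For illustration, that boundary-respecting walk is straightforward. A self-contained sketch, where is_private() and map_run() stand in for the real guest_memfd shareability query and the real IOPTE installation:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a guest_memfd shareability query. */
static bool is_private(uint64_t pfn)
{
        return pfn >= 512 && pfn < 768; /* pretend this range is private */
}

/* Stand-in for installing one IOPTE run covering [start, end). */
static void map_run(uint64_t start, uint64_t end, bool private)
{
        printf("IOPTE run: pfn %llu..%llu (%s)\n",
               (unsigned long long)start, (unsigned long long)end,
               private ? "private" : "shared");
}

/*
 * Emit maximal runs that never cross a shared/private boundary, so no
 * IOPTE spans more than one RMP state.
 */
static void map_range(uint64_t start, uint64_t end)
{
        while (start < end) {
                bool p = is_private(start);
                uint64_t run = start + 1;

                while (run < end && is_private(run) == p)
                        run++;
                map_run(start, run, p);
                start = run;
        }
}

int main(void)
{
        map_range(0, 1024);
        return 0;
}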

I had thought there were three versions of how to copy from guest
memfd into the IOPTEs:
 - HW must never have a private physaddr in an IOPTE
 - HW must have IOPTEs entirely private or shared
 - HW handles everything and IOPTEs should be maximally sized

Is this right? Is AMD #2?

Jason
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Vishal,

On Tue, 20 May 2025 at 15:11, Vishal Annapurve <vannapurve@google.com> wrote:
>
> On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
> >
> > Hi Vishal,
> >
> > On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
> > >
> > > On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
> > > >
> > > > Hi Ackerley,
> > > >
> > > > On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
> > > > >
> > > > > The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
> > > > > KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
> > > > > and private respectively.
> > > >
> > > > I have a high level question about this particular patch and this
> > > > approach for conversion: why do we need IOCTLs to manage conversion
> > > > between private and shared?
> > > >
> > > > In the presentations I gave at LPC [1, 2], and in my latest patch
> > > > series that performs in-place conversion [3] and the associated (by
> > > > now outdated) state diagram [4], I didn't see the need to have a
> > > > userspace-facing interface to manage that. KVM has all the information
> > > > it needs to handle conversions, which are triggered by the guest. To
> > > > me this seems like it adds additional complexity, as well as a user
> > > > facing interface that we would need to maintain.
> > > >
> > > > There are various ways we could handle conversion without explicit
> > > > interference from userspace. What I had in mind is the following (as
> > > > an example, details can vary according to VM type). I will use the
> > > > case of conversion from shared to private because that is the more
> > > > complicated (interesting) case:
> > > >
> > > > - Guest issues a hypercall to request that a shared folio become private.
> > > >
> > > > - The hypervisor receives the call, and passes it to KVM.
> > > >
> > > > - KVM unmaps the folio from the guest stage-2 (EPT I think in x86
> > > > parlance), and unmaps it from the host. The host however, could still
> > > > have references (e.g., GUP).
> > > >
> > > > - KVM exits to the host (hypervisor call exit), with the information
> > > > that the folio has been unshared from it.
> > > >
> > > > - A well behaving host would now get rid of all of its references
> > > > (e.g., release GUPs), perform a VCPU run, and the guest continues
> > > > running as normal. I expect this to be the common case.
> > > >
> > > > But to handle the more interesting situation, let's say that the host
> > > > doesn't do it immediately, and for some reason it holds on to some
> > > > references to that folio.
> > > >
> > > > - Even if that's the case, the guest can still run *. If the guest
> > > > tries to access the folio, KVM detects that access when it tries to
> > > > fault it into the guest, sees that the host still has references to
> > > > that folio, and exits back to the host with a memory fault exit. At
> > > > this point, the VCPU that has tried to fault in that particular folio
> > > > cannot continue running as long as it cannot fault in that folio.
> > >
> > > Are you talking about the following scheme?
> > > 1) guest_memfd checks shareability on each get pfn and if there is a
> > > mismatch exit to the host.
> >
> > I think we are not really on the same page here (no pun intended :) ).
> > I'll try to answer your questions anyway...
> >
> > Which get_pfn? Are you referring to get_pfn when faulting the page
> > into the guest or into the host?
>
> I am referring to guest fault handling in KVM.
>
> >
> > > 2) host user space has to guess whether it's a pending refcount or
> > > whether it's an actual mismatch.
> >
> > No need to guess. VCPU run will let it know exactly why it's exiting.
> >
> > > 3) guest_memfd will maintain a third state
> > > "pending_private_conversion" or equivalent which will transition to
> > > private upon the last refcount drop of each page.
> > >
> > > If conversion is triggered by userspace (in case of pKVM, it will be
> > > triggered from within the KVM (?)):
> >
> > Why would conversion be triggered by userspace? As far as I know, it's
> > the guest that triggers the conversion.
> >
> > > * Conversion will just fail if there are extra refcounts and userspace
> > > can try to get rid of extra refcounts on the range while it has enough
> > > context without hitting any ambiguity with memory fault exit.
> > > * guest_memfd will not have to deal with this extra state from 3 above
> > > and overall guest_memfd conversion handling becomes relatively
> > > simpler.
> >
> > That's not really related. The extra state isn't necessary any more
> > once we agreed in the previous discussion that we will retry instead.
>
> Who is *we* here? Which entity will retry conversion?

Userspace will re-attempt the VCPU run.

> >
> > > Note that for x86 CoCo cases, memory conversion is already triggered
> > > by userspace using KVM ioctl, this series is proposing to use
> > > guest_memfd ioctl to do the same.
> >
> > The reason why for x86 CoCo cases conversion is already triggered by
> > userspace using KVM ioctl is that it has to, since shared memory and
> > private memory are two separate pages, and userspace needs to manage
> > that. Sharing memory in place removes the need for that.
>
> Userspace still needs to clean up memory usage before conversion is
> successful. e.g. remove IOMMU mappings for shared to private
> conversion. I would think that memory conversion should not succeed
> before all existing users let go of the guest_memfd pages for the
> range being converted.

Yes. Userspace will know that it needs to do that on the VCPU exit,
which informs it of the guest's hypervisor request to unshare (convert
from shared to private) the page.

> In x86 CoCo usecases, userspace can also decide to not allow
> conversion for scenarios where ranges are still under active use by
> the host and guest is erroneously trying to take away memory. Both
> SNP/TDX spec allow failure of conversion due to in use memory.

How can the guest erroneously try to take away memory? If the guest
sends a hypervisor request asking for a conversion of memory that
doesn't belong to it, then I would expect the hypervisor to prevent
that.

I don't see how having an IOCTL to trigger the conversion is needed to
allow conversion failure. How is that different from userspace
ignoring or delaying releasing all references it has for the
conversion request?
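
For reference, the ioctl-based path being debated looks roughly like this from userspace. A sketch against the UAPI proposed in this series; the error_offset comment paraphrases the intended retry behaviour:

#include <errno.h>
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Convert [offset, offset + size) of a guest_memfd to private. */
static int gmem_convert_private(int gmem_fd, uint64_t offset, uint64_t size)
{
        struct kvm_gmem_convert conv;

        memset(&conv, 0, sizeof(conv));
        conv.offset = offset;
        conv.size = size;

        /*
         * Per this series, shared->private fails with EAGAIN while any
         * folio in the range has elevated refcounts; userspace drops its
         * references (munmap, IOMMU unmap, ...) and retries, with
         * conv.error_offset reporting where the conversion stopped.
         */
        if (ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &conv) < 0)
                return -errno;
        return 0;
}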

> >
> > This series isn't using the same ioctl, it's introducing new ones to
> > perform a task that as far as I can tell so far, KVM can handle by
> > itself.
>
> I would like to understand this better. How will KVM handle the
> conversion process for guest_memfd pages? Can you help walk an example
> sequence for shared to private conversion specifically around
> guest_memfd offset states?

To make sure that we are discussing the same scenario: can you do the
same as well please --- walk me through an example sequence for shared
to private conversion specifically around guest_memfd offset states
with the IOCTLs involved?

Here is an example that I have implemented and tested with pKVM. Note
that there are alternatives; the flow below is architecture- or even
VM-type dependent. None of this is core KVM code and the
behaviour could vary.


Assuming the folio is shared with the host:

Guest sends unshare hypercall to the hypervisor
Hypervisor forwards request to KVM (gmem) (having done due diligence)
KVM (gmem) performs an unmap_folio(), exits to userspace with
KVM_EXIT_UNSHARE and all the information about the folio being
unshared

Case 1:
Userspace removes any remaining references (GUPs, IOMMU Mappings etc...)
Userspace calls vcpu_run(): KVM (gmem) sees that there aren't any
references, sets state to PRIVATE

Case 2 (alternative 1):
Userspace doesn't release its references
Userspace calls vcpu_run(): KVM (gmem) sees that there are still
references, exits back to userspace with KVM_EXIT_UNSHARE

Case 2 (alternative 2):
Userspace doesn't release its references
Userspace calls vcpu_run(): KVM (gmem) sees that there are still
references, unmaps folio from guest, but allows it to run (until it
tries to fault in the folio)
Guest tries to fault in folio that still has reference, KVM does not
allow that (it sees that the folio is shared, and it doesn't fault in
shared folios to confidential guests)
KVM exits back to userspace with KVM_EXIT_UNSHARE

As I mentioned, the alternatives above are _not_ set in core KVM code.
They can vary by architecture or VM type, depending on the policy,
support, etc..
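
A sketch of the userspace side of the flow above. KVM_EXIT_UNSHARE and its payload are from the pKVM prototype being described, not mainline, so the constant and the folio information below are stand-ins:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

#define EXIT_UNSHARE 64         /* stand-in for the prototype's exit reason */

static void drop_references(uint64_t gfn, uint64_t nr)
{
        /* Release GUPs, tear down IOMMU mappings, etc. (stub) */
        (void)gfn;
        (void)nr;
}

/* Case 1 above: drop references on the unshare exit, then re-run. */
static int run_vcpu_loop(int vcpu_fd, struct kvm_run *run)
{
        for (;;) {
                if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
                        return -1;

                if (run->exit_reason == EXIT_UNSHARE) {
                        /* Folio info would come with the exit payload. */
                        drop_references(0, 1);
                        continue;       /* gmem flips the page private */
                }
                return 0;       /* other exit reasons handled elsewhere */
        }
}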

Now for your example please on how this would work with IOCTLs :)

Thanks,
/fuad

> >
> > >  - Allows not having to keep track of separate shared/private range
> > > information in KVM.
> >
> > This patch series is already tracking shared/private range information in KVM.
> >
> > >  - Simpler handling of the conversion process done per guest_memfd
> > > rather than for full range.
> > >      - Userspace can handle the rollback as needed, simplifying error
> > > handling in guest_memfd.
> > >  - guest_memfd is single source of truth and notifies the users of
> > > shareability change.
> > >      - e.g. IOMMU, userspace, KVM MMU all can be registered for
> > > getting notifications from guest_memfd directly and will get notified
> > > for invalidation upon shareability attribute updates.
> >
> > All of these can still be done without introducing a new ioctl.
> >
> > Cheers,
> > /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 7 months ago
On Tue, May 20, 2025 at 7:34 AM Fuad Tabba <tabba@google.com> wrote:
>
> Hi Vishal,
>
> On Tue, 20 May 2025 at 15:11, Vishal Annapurve <vannapurve@google.com> wrote:
> >
> > On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
> > >
> > > Hi Vishal,
> > >
> > > On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
> > > >
> > > > On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
> > > > >
> > > > > Hi Ackerley,
> > > > >
> > > > > On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
> > > > > >
> > > > > > The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
> > > > > > KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
> > > > > > and private respectively.
> > > > >
> > > > > I have a high level question about this particular patch and this
> > > > > approach for conversion: why do we need IOCTLs to manage conversion
> > > > > between private and shared?
> > > > >
> > > > > In the presentations I gave at LPC [1, 2], and in my latest patch
> > > > > series that performs in-place conversion [3] and the associated (by
> > > > > now outdated) state diagram [4], I didn't see the need to have a
> > > > > userspace-facing interface to manage that. KVM has all the information
> > > > > it needs to handle conversions, which are triggered by the guest. To
> > > > > me this seems like it adds additional complexity, as well as a user
> > > > > facing interface that we would need to maintain.
> > > > >
> > > > > There are various ways we could handle conversion without explicit
> > > > > interference from userspace. What I had in mind is the following (as
> > > > > an example, details can vary according to VM type). I will use the
> > > > > case of conversion from shared to private because that is the more
> > > > > complicated (interesting) case:
> > > > >
> > > > > - Guest issues a hypercall to request that a shared folio become private.
> > > > >
> > > > > - The hypervisor receives the call, and passes it to KVM.
> > > > >
> > > > > - KVM unmaps the folio from the guest stage-2 (EPT I think in x86
> > > > > parlance), and unmaps it from the host. The host however, could still
> > > > > have references (e.g., GUP).
> > > > >
> > > > > - KVM exits to the host (hypervisor call exit), with the information
> > > > > that the folio has been unshared from it.
> > > > >
> > > > > - A well behaving host would now get rid of all of its references
> > > > > (e.g., release GUPs), perform a VCPU run, and the guest continues
> > > > > running as normal. I expect this to be the common case.
> > > > >
> > > > > But to handle the more interesting situation, let's say that the host
> > > > > doesn't do it immediately, and for some reason it holds on to some
> > > > > references to that folio.
> > > > >
> > > > > - Even if that's the case, the guest can still run *. If the guest
> > > > > tries to access the folio, KVM detects that access when it tries to
> > > > > fault it into the guest, sees that the host still has references to
> > > > > that folio, and exits back to the host with a memory fault exit. At
> > > > > this point, the VCPU that has tried to fault in that particular folio
> > > > > cannot continue running as long as it cannot fault in that folio.
> > > >
> > > > Are you talking about the following scheme?
> > > > 1) guest_memfd checks shareability on each get pfn and if there is a
> > > > mismatch exit to the host.
> > >
> > > I think we are not really on the same page here (no pun intended :) ).
> > > I'll try to answer your questions anyway...
> > >
> > > Which get_pfn? Are you referring to get_pfn when faulting the page
> > > into the guest or into the host?
> >
> > I am referring to guest fault handling in KVM.
> >
> > >
> > > > 2) host user space has to guess whether it's a pending refcount or
> > > > whether it's an actual mismatch.
> > >
> > > No need to guess. VCPU run will let it know exactly why it's exiting.
> > >
> > > > 3) guest_memfd will maintain a third state
> > > > "pending_private_conversion" or equivalent which will transition to
> > > > private upon the last refcount drop of each page.
> > > >
> > > > If conversion is triggered by userspace (in case of pKVM, it will be
> > > > triggered from within the KVM (?)):
> > >
> > > Why would conversion be triggered by userspace? As far as I know, it's
> > > the guest that triggers the conversion.
> > >
> > > > * Conversion will just fail if there are extra refcounts and userspace
> > > > can try to get rid of extra refcounts on the range while it has enough
> > > > context without hitting any ambiguity with memory fault exit.
> > > > * guest_memfd will not have to deal with this extra state from 3 above
> > > > and overall guest_memfd conversion handling becomes relatively
> > > > simpler.
> > >
> > > That's not really related. The extra state isn't necessary any more
> > > once we agreed in the previous discussion that we will retry instead.
> >
> > Who is *we* here? Which entity will retry conversion?
>
> Userspace will re-attempt the VCPU run.

Then KVM will have to keep track of the ranges that need conversion
across exits. I think it's cleaner to let userspace make the decision
and invoke conversion without carrying additional state in KVM about
the guest request.

>
> > >
> > > > Note that for x86 CoCo cases, memory conversion is already triggered
> > > > by userspace using KVM ioctl, this series is proposing to use
> > > > guest_memfd ioctl to do the same.
> > >
> > > The reason why for x86 CoCo cases conversion is already triggered by
> > > userspace using KVM ioctl is that it has to, since shared memory and
> > > private memory are two separate pages, and userspace needs to manage
> > > that. Sharing memory in place removes the need for that.
> >
> > Userspace still needs to clean up memory usage before conversion is
> > successful. e.g. remove IOMMU mappings for shared to private
> > conversion. I would think that memory conversion should not succeed
> > before all existing users let go of the guest_memfd pages for the
> > range being converted.
>
> Yes. Userspace will know that it needs to do that on the VCPU exit,
> which informs it of the guest's hypervisor request to unshare (convert
> from shared to private) the page.
>
> > In x86 CoCo usecases, userspace can also decide to not allow
> > conversion for scenarios where ranges are still under active use by
> > the host and the guest is erroneously trying to take away memory. Both
> > the SNP and TDX specs allow failure of conversion due to in-use memory.
>
> How can the guest erroneously try to take away memory? If the guest
> sends a hypervisor request asking for a conversion of memory that
> doesn't belong to it, then I would expect the hypervisor to prevent
> that.

Marking a range as private effectively disallows the host from
accessing those ranges -> so taking away memory.

>
> I don't see how having an IOCTL to trigger the conversion is needed to
> allow conversion failure. How is that different from userspace
> ignoring or delaying releasing all references it has for the
> conversion request?
>
> > >
> > > This series isn't using the same ioctl, it's introducing new ones to
> > > perform a task that as far as I can tell so far, KVM can handle by
> > > itself.
> >
> > I would like to understand this better. How will KVM handle the
> > conversion process for guest_memfd pages? Can you help walk an example
> > sequence for shared to private conversion specifically around
> > guest_memfd offset states?
>
> To make sure that we are discussing the same scenario: can you do the
> same as well please --- walk me through an example sequence for shared
> to private conversion specifically around guest_memfd offset states
> with the IOCTLs involved?
>
> Here is an example that I have implemented and tested with pKVM. Note
> that there are alternatives; the flow below is architecture or even
> VM-type dependent. None of this code is core KVM code, and the
> behaviour could vary.
>
>
> Assuming the folio is shared with the host:
>
> Guest sends unshare hypercall to the hypervisor
> Hypervisor forwards request to KVM (gmem) (having done due diligence)
> KVM (gmem) performs an unmap_folio(), exits to userspace with

For the x86 CoCo VM use cases I was talking about, userspace would
like to avoid unmap_mapping_range() on the range before it's safe to
unshare the range.

> KVM_EXIT_UNSHARE and all the information about the folio being
> unshared
>
> Case 1:
> Userspace removes any remaining references (GUPs, IOMMU Mappings etc...)
> Userspace calls vcpu_run(): KVM (gmem) sees that there aren't any
> references, sets state to PRIVATE
>
> Case 2 (alternative 1):
> Userspace doesn't release its references
> Userspace calls vcpu_run(): KVM (gmem) sees that there are still
> references, exits back to userspace with KVM_EXIT_UNSHARE
>
> Case 2 (alternative 2):
> Userspace doesn't release its references
> Userspace calls vcpu_run(): KVM (gmem) sees that there are still
> references, unmaps folio from guest, but allows it to run (until it
> tries to fault in the folio)
> Guest tries to fault in folio that still has reference, KVM does not
> allow that (it sees that the folio is shared, and it doesn't fault in
> shared folios to confidential guests)
> KVM exits back to userspace with KVM_EXIT_UNSHARE
>
> As I mentioned, the alternatives above are _not_ set in core KVM code.
> They can vary by architecture or VM type, depending on the policy,
> support, etc.
>
> Now for your example please on how this would work with IOCTLs :)
>
> Thanks,
> /fuad
>
> > >
> > > >  - Allows not having to keep track of separate shared/private range
> > > > information in KVM.
> > >
> > > This patch series is already tracking shared/private range information in KVM.
> > >
> > > >  - Simpler handling of the conversion process done per guest_memfd
> > > > rather than for full range.
> > > >      - Userspace can handle the rollback as needed, simplifying error
> > > > handling in guest_memfd.
> > > >  - guest_memfd is single source of truth and notifies the users of
> > > > shareability change.
> > > >      - e.g. IOMMU, userspace, KVM MMU all can be registered for
> > > > getting notifications from guest_memfd directly and will get notified
> > > > for invalidation upon shareability attribute updates.
> > >
> > > All of these can still be done without introducing a new ioctl.
> > >
> > > Cheers,
> > > /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Vishal,

On Tue, 20 May 2025 at 17:03, Vishal Annapurve <vannapurve@google.com> wrote:
>
> On Tue, May 20, 2025 at 7:34 AM Fuad Tabba <tabba@google.com> wrote:
> >
> > Hi Vishal,
> >
> > On Tue, 20 May 2025 at 15:11, Vishal Annapurve <vannapurve@google.com> wrote:
> > >
> > > On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
> > > >
> > > > Hi Vishal,
> > > >
> > > > On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
> > > > >
> > > > > On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
> > > > > >
> > > > > > Hi Ackerley,
> > > > > >
> > > > > > On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
> > > > > > >
> > > > > > > The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
> > > > > > > KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
> > > > > > > and private respectively.
> > > > > >
> > > > > > I have a high level question about this particular patch and this
> > > > > > approach for conversion: why do we need IOCTLs to manage conversion
> > > > > > between private and shared?
> > > > > >
> > > > > > In the presentations I gave at LPC [1, 2], and in my latest patch
> > > > > > series that performs in-place conversion [3] and the associated (by
> > > > > > now outdated) state diagram [4], I didn't see the need to have a
> > > > > > userspace-facing interface to manage that. KVM has all the information
> > > > > > it needs to handle conversions, which are triggered by the guest. To
> > > > > > me this seems like it adds additional complexity, as well as a user
> > > > > > facing interface that we would need to maintain.
> > > > > >
> > > > > > There are various ways we could handle conversion without explicit
> > > > > > interference from userspace. What I had in mind is the following (as
> > > > > > an example, details can vary according to VM type). I will use the
> > > > > > case of conversion from shared to private because that is the more
> > > > > > complicated (interesting) case:
> > > > > >
> > > > > > - Guest issues a hypercall to request that a shared folio become private.
> > > > > >
> > > > > > - The hypervisor receives the call, and passes it to KVM.
> > > > > >
> > > > > > - KVM unmaps the folio from the guest stage-2 (EPT I think in x86
> > > > > > parlance), and unmaps it from the host. The host however, could still
> > > > > > have references (e.g., GUP).
> > > > > >
> > > > > > - KVM exits to the host (hypervisor call exit), with the information
> > > > > > that the folio has been unshared from it.
> > > > > >
> > > > > > - A well behaving host would now get rid of all of its references
> > > > > > (e.g., release GUPs), perform a VCPU run, and the guest continues
> > > > > > running as normal. I expect this to be the common case.
> > > > > >
> > > > > > But to handle the more interesting situation, let's say that the host
> > > > > > doesn't do it immediately, and for some reason it holds on to some
> > > > > > references to that folio.
> > > > > >
> > > > > > - Even if that's the case, the guest can still run *. If the guest
> > > > > > tries to access the folio, KVM detects that access when it tries to
> > > > > > fault it into the guest, sees that the host still has references to
> > > > > > that folio, and exits back to the host with a memory fault exit. At
> > > > > > this point, the VCPU that has tried to fault in that particular folio
> > > > > > cannot continue running as long as it cannot fault in that folio.
> > > > >
> > > > > Are you talking about the following scheme?
> > > > > 1) guest_memfd checks shareability on each get pfn and if there is a
> > > > > mismatch exit to the host.
> > > >
> > > > I think we are not really on the same page here (no pun intended :) ).
> > > > I'll try to answer your questions anyway...
> > > >
> > > > Which get_pfn? Are you referring to get_pfn when faulting the page
> > > > into the guest or into the host?
> > >
> > > I am referring to guest fault handling in KVM.
> > >
> > > >
> > > > > 2) host user space has to guess whether it's a pending refcount or
> > > > > whether it's an actual mismatch.
> > > >
> > > > No need to guess. VCPU run will let it know exactly why it's exiting.
> > > >
> > > > > 3) guest_memfd will maintain a third state
> > > > > "pending_private_conversion" or equivalent which will transition to
> > > > > private upon the last refcount drop of each page.
> > > > >
> > > > > If conversion is triggered by userspace (in case of pKVM, it will be
> > > > > triggered from within the KVM (?)):
> > > >
> > > > Why would conversion be triggered by userspace? As far as I know, it's
> > > > the guest that triggers the conversion.
> > > >
> > > > > * Conversion will just fail if there are extra refcounts and userspace
> > > > > can try to get rid of extra refcounts on the range while it has enough
> > > > > context without hitting any ambiguity with memory fault exit.
> > > > > * guest_memfd will not have to deal with this extra state from 3 above
> > > > > and overall guest_memfd conversion handling becomes relatively
> > > > > simpler.
> > > >
> > > > That's not really related. The extra state isn't necessary any more
> > > > once we agreed in the previous discussion that we will retry instead.
> > >
> > > Who is *we* here? Which entity will retry conversion?
> >
> > Userspace will re-attempt the VCPU run.
>
> Then KVM will have to keep track of the ranges that need conversion
> across exits. I think it's cleaner to let userspace make the decision
> and invoke conversion without carrying additional state in KVM about
> the guest request.

I disagree. I think it's cleaner not to introduce a user interface,
and just to track the reason for the last exit, along with the
required additional data. KVM is already responsible for handling the
workflow; why delegate this last part to the VMM?

> >
> > > >
> > > > > Note that for x86 CoCo cases, memory conversion is already triggered
> > > > > by userspace using KVM ioctl, this series is proposing to use
> > > > > guest_memfd ioctl to do the same.
> > > >
> > > > The reason why for x86 CoCo cases conversion is already triggered by
> > > > userspace using KVM ioctl is that it has to, since shared memory and
> > > > private memory are two separate pages, and userspace needs to manage
> > > > that. Sharing memory in place removes the need for that.
> > >
> > > Userspace still needs to clean up memory usage before conversion is
> > > successful. e.g. remove IOMMU mappings for shared to private
> > > conversion. I would think that memory conversion should not succeed
> > > before all existing users let go of the guest_memfd pages for the
> > > range being converted.
> >
> > Yes. Userspace will know that it needs to do that on the VCPU exit,
> > which informs it of the guest's hypervisor request to unshare (convert
> > from shared to private) the page.
> >
> > > In x86 CoCo usecases, userspace can also decide to not allow
> > > conversion for scenarios where ranges are still under active use by
> > > the host and the guest is erroneously trying to take away memory. Both
> > > the SNP and TDX specs allow failure of conversion due to in-use memory.
> >
> > How can the guest erroneously try to take away memory? If the guest
> > sends a hypervisor request asking for a conversion of memory that
> > doesn't belong to it, then I would expect the hypervisor to prevent
> > that.
>
> Marking a range as private effectively disallows the host from
> accessing those ranges -> so taking away memory.

You said "erroneously" earlier. My question is, how can the guest
*erroneously* try to take away memory? This is the normal flow of
guest/host relations. The memory is the guest's: it decides when to
share it with the host, and it can take it away.

> >
> > I don't see how having an IOCTL to trigger the conversion is needed to
> > allow conversion failure. How is that different from userspace
> > ignoring or delaying releasing all references it has for the
> > conversion request?
> >
> > > >
> > > > This series isn't using the same ioctl, it's introducing new ones to
> > > > perform a task that as far as I can tell so far, KVM can handle by
> > > > itself.
> > >
> > > I would like to understand this better. How will KVM handle the
> > > conversion process for guest_memfd pages? Can you help walk an example
> > > sequence for shared to private conversion specifically around
> > > guest_memfd offset states?
> >
> > To make sure that we are discussing the same scenario: can you do the
> > same as well please --- walk me through an example sequence for shared
> > to private conversion specifically around guest_memfd offset states
> > with the IOCTLs involved?
> >
> > Here is an example that I have implemented and tested with pKVM. Note
> > that there are alternatives; the flow below is architecture or even
> > VM-type dependent. None of this code is core KVM code, and the
> > behaviour could vary.
> >
> >
> > Assuming the folio is shared with the host:
> >
> > Guest sends unshare hypercall to the hypervisor
> > Hypervisor forwards request to KVM (gmem) (having done due diligence)
> > KVM (gmem) performs an unmap_folio(), exits to userspace with
>
> For the x86 CoCo VM use cases I was talking about, userspace would
> like to avoid unmap_mapping_range() on the range before it's safe to
> unshare the range.

Why? There is no harm in userspace unmapping before the memory is
unshared. I don't see the problem with that.

You still haven't responded to my question from the previous email:
can you please return the favor and walk me through an example
sequence for shared to private conversion specifically around
guest_memfd offset states with the IOCTLs involved? :D

Thanks!
/fuad


> > KVM_EXIT_UNSHARE and all the information about the folio being
> > unshared
> >
> > Case 1:
> > Userspace removes any remaining references (GUPs, IOMMU Mappings etc...)
> > Userspace calls vcpu_run(): KVM (gmem) sees that there aren't any
> > references, sets state to PRIVATE
> >
> > Case 2 (alternative 1):
> > Userspace doesn't release its references
> > Userspace calls vcpu_run(): KVM (gmem) sees that there are still
> > references, exits back to userspace with KVM_EXIT_UNSHARE
> >
> > Case 2 (alternative 2):
> > Userspace doesn't release its references
> > Userspace calls vcpu_run(): KVM (gmem) sees that there are still
> > references, unmaps folio from guest, but allows it to run (until it
> > tries to fault in the folio)
> > Guest tries to fault in folio that still has reference, KVM does not
> > allow that (it sees that the folio is shared, and it doesn't fault in
> > shared folios to confidential guests)
> > KVM exits back to userspace with KVM_EXIT_UNSHARE
> >
> > As I mentioned, the alternatives above are _not_ set in core KVM code.
> > They can vary by architecture or VM type, depending on the policy,
> > support, etc.
> >
> > Now for your example please on how this would work with IOCTLs :)
> >
> > Thanks,
> > /fuad
> >
> > > >
> > > > >  - Allows not having to keep track of separate shared/private range
> > > > > information in KVM.
> > > >
> > > > This patch series is already tracking shared/private range information in KVM.
> > > >
> > > > >  - Simpler handling of the conversion process done per guest_memfd
> > > > > rather than for full range.
> > > > >      - Userspace can handle the rollback as needed, simplifying error
> > > > > handling in guest_memfd.
> > > > >  - guest_memfd is single source of truth and notifies the users of
> > > > > shareability change.
> > > > >      - e.g. IOMMU, userspace, KVM MMU all can be registered for
> > > > > getting notifications from guest_memfd directly and will get notified
> > > > > for invalidation upon shareability attribute updates.
> > > >
> > > > All of these can still be done without introducing a new ioctl.
> > > >
> > > > Cheers,
> > > > /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ackerley Tng 7 months ago
Fuad Tabba <tabba@google.com> writes:

Let me try to bridge the gap here beginning with the flow we were
counting on for a shared to private conversion, for TDX:

1. Guest sends unshare hypercall to the hypervisor

2. (For x86 IIUC hypervisor is the same as KVM) KVM forwards the request
   to userspace via a KVM_EXIT_HYPERCALL, with KVM_HC_MAP_GPA_RANGE as
   the hypercall number.

   KVM also records that the guest wanted a shared to private
   conversion, the gpa and size of the request (no change from now, KVM
   already records that information in struct kvm_run) [1]

3. Userspace will do necessary coordination in userspace, then call the
   conversion ioctl, passing the parameters along to the ioctl.

4. The ioctl goes to guest_memfd; guest_memfd unmaps the pages and
   checks refcounts. If there's anything unexpected, error out to
   userspace. If all is well, flip shareability and return success to
   userspace.

5. Userspace calls vcpu_run() again, the handler for
   KVM_HC_MAP_GPA_RANGE will tell the guest that userspace was able to
   fulfill guest request with hypercall.ret set to 0 and then the guest
   will continue.

6. On the next fault guest_memfd will allow the private fault from the
   guest.
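
In rough VMM pseudo-C, steps 3-5 could look like this (a sketch
assuming this series' uAPI; gpa_to_gmem_offset() is a made-up helper
that maps the hypercall GPA through userspace's memslot layout to a
guest_memfd offset, and error encoding to the guest is simplified):

	/* On KVM_EXIT_HYPERCALL, run->hypercall.nr == KVM_HC_MAP_GPA_RANGE: */
	struct kvm_gmem_convert conv = {
		.offset = gpa_to_gmem_offset(run->hypercall.args[0]),
		.size   = run->hypercall.args[1] * 4096, /* assuming 4K pages */
	};
	bool to_private = run->hypercall.args[2] & KVM_MAP_GPA_RANGE_ENCRYPTED;
	int ret;

	/* Steps 3-4: after userspace coordination, guest_memfd unmaps the
	 * pages, checks refcounts and flips shareability, or errors out. */
	ret = ioctl(gmem_fd, to_private ? KVM_GMEM_CONVERT_PRIVATE
					: KVM_GMEM_CONVERT_SHARED, &conv);

	/* Step 5: report the outcome to the guest on the next KVM_RUN;
	 * the exact error encoding is guest-ABI specific. */
	run->hypercall.ret = ret ? 1 : 0;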


The flow you're proposing works too, with some changes, but it's
probably okay for x86 to have a slightly different flow anyway: (I
refactored the steps you outlined)

> 1. Guest sends unshare hypercall to the hypervisor

Same

> 2. Hypervisor forwards request to KVM (gmem) (having done due diligence)

For x86 IIUC hypervisor is the same as KVM, so there's no forwarding to KVM.

> 3. KVM (gmem) performs an unmap_folio(), exits to userspace with
>    KVM_EXIT_UNSHARE and all the information about the folio being unshared

The KVM_EXIT_UNSHARE here would correspond to x86's
KVM_HC_MAP_GPA_RANGE.

Unmapping before exiting with KVM_EXIT_UNSHARE here might be a little
premature since userspace may have to do some stuff before permitting
the conversion. For example, the memory may be mapped into another
userspace driver process, which needs to first be stopped.

But no issue: as long as we don't flip shareability, if the host
uses the memory, kvm_gmem_fault_shared() will just happen again,
nullifying the unmapping.

We could just shift the unmapping till after vcpu_run() is called
again.

> 4. Userspace will do necessary coordination in userspace, then do
>    vcpu_run()

There's another layer here, at least for x86, as to whether the
coordination was successful. For x86's KVM_HC_MAP_GPA_RANGE, userspace
can indicate a non-zero hypercall.ret for error.

For unsuccessful coordinations, userspace sets hypercall.ret to an
error and the vcpu_run() handler doesn't try the conversion. The guest
is informed of the hypercall error and will figure it out.

> 5. Successful coordination, case 1: vcpu_run() knows the last exit was
>    KVM_EXIT_UNSHARE and will set state to PRIVATE

For case 1, userspace will set hypercall.ret to 0, and guest_memfd
will do the conversion, basically calling the same function that the
ioctl calls within guest_memfd.

> 5. Successful coordination, case 2, alternative 1: vcpu_run() knows
>    the last exit was KVM_EXIT_UNSHARE

Exit to userspace with KVM_EXIT_MEMORY_FAULT.

> 5. Successful coordination, case 2, alternative 2: vcpu_run() knows
>    the last exit was KVM_EXIT_UNSHARE

Forward hypercall.ret == 0 to the guest. Since the conversion was not
performed, the next fault will be mismatched and there will be a
KVM_EXIT_MEMORY_FAULT.
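
For reference, that mismatch then surfaces to userspace roughly like
this (a sketch against the existing KVM_EXIT_MEMORY_FAULT uAPI):

	if (run->exit_reason == KVM_EXIT_MEMORY_FAULT) {
		__u64 gpa = run->memory_fault.gpa;
		__u64 len = run->memory_fault.size;
		/* Guest faulted private while shareability still says shared. */
		bool want_private =
			run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE;

		/* Retry the coordination + conversion for [gpa, gpa + len). */
	}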

> Hi Vishal,
>
> On Tue, 20 May 2025 at 17:03, Vishal Annapurve <vannapurve@google.com> wrote:
>>
>> On Tue, May 20, 2025 at 7:34 AM Fuad Tabba <tabba@google.com> wrote:
>> >
>> > Hi Vishal,
>> >
>> > On Tue, 20 May 2025 at 15:11, Vishal Annapurve <vannapurve@google.com> wrote:
>> > >
>> > > On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
>> > > >
>> > > > Hi Vishal,
>> > > >
>> > > > On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
>> > > > >
>> > > > > On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
>> > > > > >
>> > > > > > Hi Ackerley,
>> > > > > >
>> > > > > > On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
>> > > > > > >
>> > > > > > > The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
>> > > > > > > KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
>> > > > > > > and private respectively.
>> > > > > >
>> > > > > > I have a high level question about this particular patch and this
>> > > > > > approach for conversion: why do we need IOCTLs to manage conversion
>> > > > > > between private and shared?
>> > > > > >
>> > > > > > In the presentations I gave at LPC [1, 2], and in my latest patch
>> > > > > > series that performs in-place conversion [3] and the associated (by
>> > > > > > now outdated) state diagram [4], I didn't see the need to have a
>> > > > > > userspace-facing interface to manage that. KVM has all the information
>> > > > > > it needs to handle conversions, which are triggered by the guest. To
>> > > > > > me this seems like it adds additional complexity, as well as a user
>> > > > > > facing interface that we would need to maintain.
>> > > > > >
>> > > > > > There are various ways we could handle conversion without explicit
>> > > > > > interference from userspace. What I had in mind is the following (as
>> > > > > > an example, details can vary according to VM type). I will use the
>> > > > > > case of conversion from shared to private because that is the more
>> > > > > > complicated (interesting) case:
>> > > > > >
>> > > > > > - Guest issues a hypercall to request that a shared folio become private.
>> > > > > >
>> > > > > > - The hypervisor receives the call, and passes it to KVM.
>> > > > > >
>> > > > > > - KVM unmaps the folio from the guest stage-2 (EPT I think in x86
>> > > > > > parlance), and unmaps it from the host. The host however, could still
>> > > > > > have references (e.g., GUP).
>> > > > > >
>> > > > > > - KVM exits to the host (hypervisor call exit), with the information
>> > > > > > that the folio has been unshared from it.
>> > > > > >
>> > > > > > - A well behaving host would now get rid of all of its references
>> > > > > > (e.g., release GUPs), perform a VCPU run, and the guest continues
>> > > > > > running as normal. I expect this to be the common case.
>> > > > > >
>> > > > > > But to handle the more interesting situation, let's say that the host
>> > > > > > doesn't do it immediately, and for some reason it holds on to some
>> > > > > > references to that folio.
>> > > > > >
>> > > > > > - Even if that's the case, the guest can still run *. If the guest
>> > > > > > tries to access the folio, KVM detects that access when it tries to
>> > > > > > fault it into the guest, sees that the host still has references to
>> > > > > > that folio, and exits back to the host with a memory fault exit. At
>> > > > > > this point, the VCPU that has tried to fault in that particular folio
>> > > > > > cannot continue running as long as it cannot fault in that folio.
>> > > > >
>> > > > > Are you talking about the following scheme?
>> > > > > 1) guest_memfd checks shareability on each get pfn and if there is a
>> > > > > mismatch exit to the host.
>> > > >
>> > > > I think we are not really on the same page here (no pun intended :) ).
>> > > > I'll try to answer your questions anyway...
>> > > >
>> > > > Which get_pfn? Are you referring to get_pfn when faulting the page
>> > > > into the guest or into the host?
>> > >
>> > > I am referring to guest fault handling in KVM.
>> > >
>> > > >
>> > > > > 2) host user space has to guess whether it's a pending refcount or
>> > > > > whether it's an actual mismatch.
>> > > >
>> > > > No need to guess. VCPU run will let it know exactly why it's exiting.
>> > > >
>> > > > > 3) guest_memfd will maintain a third state
>> > > > > "pending_private_conversion" or equivalent which will transition to
>> > > > > private upon the last refcount drop of each page.
>> > > > >
>> > > > > If conversion is triggered by userspace (in case of pKVM, it will be
>> > > > > triggered from within the KVM (?)):
>> > > >
>> > > > Why would conversion be triggered by userspace? As far as I know, it's
>> > > > the guest that triggers the conversion.
>> > > >
>> > > > > * Conversion will just fail if there are extra refcounts and userspace
>> > > > > can try to get rid of extra refcounts on the range while it has enough
>> > > > > context without hitting any ambiguity with memory fault exit.
>> > > > > * guest_memfd will not have to deal with this extra state from 3 above
>> > > > > and overall guest_memfd conversion handling becomes relatively
>> > > > > simpler.
>> > > >
>> > > > That's not really related. The extra state isn't necessary any more
>> > > > once we agreed in the previous discussion that we will retry instead.
>> > >
>> > > Who is *we* here? Which entity will retry conversion?
>> >
>> > Userspace will re-attempt the VCPU run.
>>
>> Then KVM will have to keep track of the ranges that need conversion
>> across exits. I think it's cleaner to let userspace make the decision
>> and invoke conversion without carrying additional state in KVM about
>> the guest request.
>
> I disagree. I think it's cleaner not to introduce a user interface,
> and just to track the reason for the last exit, along with the
> required additional data. KVM is already responsible for handling the
> workflow; why delegate this last part to the VMM?
>

I believe Fuad's concern is the complexity of adding and maintaining
another ioctl, as opposed to having vcpu_run() do the conversions.

I think the two options are basically the same in that both are actually
adding some form of user contract, just in different places.

For the ioctl approach, in this RFCv2 I added an error_offset field so
that userspace has a hint of where the conversion had an issue. The
ioctl also returns errors to indicate what went wrong, like -EINVAL, or
-ENOMEM if splitting the page required memory and there wasn't any, or
if the kernel ran out of memory trying to update mappability.
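
For instance, assuming elevated refcounts surface as -EAGAIN and with
a made-up drop_references_in_range() stand-in, userspace could consume
error_offset along these lines (sketch only):

	struct kvm_gmem_convert conv = { .offset = offset, .size = size };

	while (ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &conv)) {
		if (errno != EAGAIN)
			break;	/* -EINVAL, -ENOMEM, ...: report and bail */

		/*
		 * conv.error_offset hints at where the conversion ran
		 * into trouble, e.g. a folio with elevated refcounts;
		 * drop the references there and retry.
		 */
		drop_references_in_range(gmem_fd, conv.error_offset);
	}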

If we want to provide the same level of error information for the
vcpu_run() approach, we should probably add error_offset to
KVM_EXIT_MEMORY_FAULT so that on a conversion failure we could re-exit
to userspace with more information about the error_offset.


So what we're really comparing is two ways to perform the conversion:
(1) via a direct ioctl, and (2) via vcpu_run().

I think having a direct ioctl is cleaner because it doesn't involve
vCPUs for a memory operation.

Conceptually, the conversion is a memory operation belonging to memory
in the guest_memfd. Hence, the conversion operation is better addressed
directly to the memory via a direct ioctl.

For this same reason, we didn't want to do the conversion via the
KVM_SET_MEMORY_ATTRIBUTES ioctl. KVM_SET_MEMORY_ATTRIBUTES is an
operation on KVM's view of guest_memfd, which is linked to, but not
directly the same as, a memory operation.

By having a direct ioctl instead of using KVM_SET_MEMORY_ATTRIBUTES,
we avoid having a dependency where memslots must first be bound to
guest_memfd for the conversion to work.

When rebooting, the memslots may not yet be bound to the guest_memfd,
but we want to reset the guest_memfds to private. If we use
KVM_SET_MEMORY_ATTRIBUTES to convert, we'd be forced to first bind,
then convert. With a direct ioctl, we don't have this restriction.

If we do the conversion via vcpu_run(), we would be forced to handle
conversions only through vcpu_run(), and only the guest could initiate
a conversion.

On a guest boot for TDX, the memory is assumed to be private. If we
gave it memory set as shared, we'd just have a bunch of
KVM_EXIT_MEMORY_FAULTs that slow down boot. Hence on a guest reboot, we
will want to reset the guest memory to private.

We could say the firmware should reset memory to private on guest
reboot, but we can't force all guests to update firmware.
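
Concretely, the reboot case could then be a single call per
guest_memfd, with no binding to a struct kvm required (a sketch
assuming this series' ioctl):

	static int gmem_reset_private(int gmem_fd)
	{
		struct stat st;
		struct kvm_gmem_convert conv = { .offset = 0 };

		if (fstat(gmem_fd, &st))
			return -1;

		/* Convert the whole file; no struct kvm binding needed. */
		conv.size = st.st_size;
		return ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &conv);
	}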

>> >
>> > > >
>> > > > > Note that for x86 CoCo cases, memory conversion is already triggered
>> > > > > by userspace using KVM ioctl, this series is proposing to use
>> > > > > guest_memfd ioctl to do the same.
>> > > >
>> > > > The reason why for x86 CoCo cases conversion is already triggered by
>> > > > userspace using KVM ioctl is that it has to, since shared memory and
>> > > > private memory are two separate pages, and userspace needs to manage
>> > > > that. Sharing memory in place removes the need for that.
>> > >
>> > > Userspace still needs to clean up memory usage before conversion is
>> > > successful. e.g. remove IOMMU mappings for shared to private
>> > > conversion. I would think that memory conversion should not succeed
>> > > before all existing users let go of the guest_memfd pages for the
>> > > range being converted.
>> >
>> > Yes. Userspace will know that it needs to do that on the VCPU exit,
>> > which informs it of the guest's hypervisor request to unshare (convert
>> > from shared to private) the page.
>> >
>> > > In x86 CoCo usecases, userspace can also decide to not allow
>> > > conversion for scenarios where ranges are still under active use by
>> > > the host and the guest is erroneously trying to take away memory. Both
>> > > the SNP and TDX specs allow failure of conversion due to in-use memory.
>> >
>> > How can the guest erroneously try to take away memory? If the guest
>> > sends a hypervisor request asking for a conversion of memory that
>> > doesn't belong to it, then I would expect the hypervisor to prevent
>> > that.
>>
>> Marking a range as private effectively disallows the host from
>> accessing those ranges -> so taking away memory.
>
> You said "erroneously" earlier. My question is, how can the guest
> *erroneously* try to take away memory? This is the normal flow of
> guest/host relations. The memory is the guest's: it decides when to
> share it with the host, and it can take it away.
>

See above, it's not really erroneous as long as
kvm_gmem_fault_shared() can still happen, since after unmapping, any
host access will just fault the page again.

>> >
>> > I don't see how having an IOCTL to trigger the conversion is needed to
>> > allow conversion failure. How is that different from userspace
>> > ignoring or delaying releasing all references it has for the
>> > conversion request?
>> >
>> > > >
>> > > > This series isn't using the same ioctl, it's introducing new ones to
>> > > > perform a task that as far as I can tell so far, KVM can handle by
>> > > > itself.
>> > >
>> > > I would like to understand this better. How will KVM handle the
>> > > conversion process for guest_memfd pages? Can you help walk an example
>> > > sequence for shared to private conversion specifically around
>> > > guest_memfd offset states?
>> >
>> > To make sure that we are discussing the same scenario: can you do the
>> > same as well please --- walk me through an example sequence for shared
>> > to private conversion specifically around guest_memfd offset states
>> > with the IOCTLs involved?
>> >
>> > Here is an example that I have implemented and tested with pKVM. Note
>> > that there are alternatives; the flow below is architecture or even
>> > VM-type dependent. None of this code is core KVM code, and the
>> > behaviour could vary.
>> >
>> >
>> > Assuming the folio is shared with the host:
>> >
>> > Guest sends unshare hypercall to the hypervisor
>> > Hypervisor forwards request to KVM (gmem) (having done due diligence)
>> > KVM (gmem) performs an unmap_folio(), exits to userspace with
>>
>> For the x86 CoCo VM use cases I was talking about, userspace would
>> like to avoid unmap_mapping_range() on the range before it's safe to
>> unshare the range.
>
> Why? There is no harm in userspace unmapping before the memory is
> unshared. I don't see the problem with that.
>

Yes, no harm done, just possible remapping after unmapping.

> You still haven't responded to my question from the previous email:
> can you please return the favor and walk me through an example
> sequence for shared to private conversion specifically around
> guest_memfd offset states with the IOCTLs involved? :D
>

Right at the top :)

> Thanks!
> /fuad
>
>
>> > KVM_EXIT_UNSHARE and all the information about the folio being
>> > unshared
>> >
>> > Case 1:
>> > Userspace removes any remaining references (GUPs, IOMMU Mappings etc...)
>> > Userspace calls vcpu_run(): KVM (gmem) sees that there aren't any
>> > references, sets state to PRIVATE
>> >
>> > Case 2 (alternative 1):
>> > Userspace doesn't release its references
>> > Userspace calls vcpu_run(): KVM (gmem) sees that there are still
>> > references, exits back to userspace with KVM_EXIT_UNSHARE
>> >
>> > Case 2 (alternative 2):
>> > Userspace doesn't release its references
>> > Userspace calls vcpu_run(): KVM (gmem) sees that there are still
>> > references, unmaps folio from guest, but allows it to run (until it
>> > tries to fault in the folio)
>> > Guest tries to fault in folio that still has reference, KVM does not
>> > allow that (it sees that the folio is shared, and it doesn't fault in
>> > shared folios to confidential guests)
>> > KVM exits back to userspace with KVM_EXIT_UNSHARE
>> >
>> > As I mentioned, the alternatives above are _not_ set in core KVM code.
>> > They can vary by architecture or VM type, depending on the policy,
>> > support, etc.
>> >
>> > Now for your example please on how this would work with IOCTLs :)
>> >
>> > Thanks,
>> > /fuad
>> >
>> > > >
>> > > > >  - Allows not having to keep track of separate shared/private range
>> > > > > information in KVM.
>> > > >
>> > > > This patch series is already tracking shared/private range information in KVM.
>> > > >
>> > > > >  - Simpler handling of the conversion process done per guest_memfd
>> > > > > rather than for full range.
>> > > > >      - Userspace can handle the rollback as needed, simplifying error
>> > > > > handling in guest_memfd.
>> > > > >  - guest_memfd is single source of truth and notifies the users of
>> > > > > shareability change.
>> > > > >      - e.g. IOMMU, userspace, KVM MMU all can be registered for
>> > > > > getting notifications from guest_memfd directly and will get notified
>> > > > > for invalidation upon shareability attribute updates.
>> > > >
>> > > > All of these can still be done without introducing a new ioctl.
>> > > >
>> > > > Cheers,
>> > > > /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Ackerley,

On Tue, 20 May 2025 at 20:40, Ackerley Tng <ackerleytng@google.com> wrote:
>
> Fuad Tabba <tabba@google.com> writes:
>
> Let me try to bridge the gap here beginning with the flow we were
> counting on for a shared to private conversion, for TDX:
>
> 1. Guest sends unshare hypercall to the hypervisor
>
> 2. (For x86 IIUC hypervisor is the same as KVM) KVM forwards the request
>    to userspace via a KVM_EXIT_HYPERCALL, with KVM_HC_MAP_GPA_RANGE as
>    the hypercall number.
>
>    KVM also records that the guest wanted a shared to private
>    conversion, the gpa and size of the request (no change from now, KVM
>    already records that information in struct kvm_run) [1]
>
> 3. Userspace will do necessary coordination in userspace, then call the
>    conversion ioctl, passing the parameters along to the ioctl.
>
> 4. The ioctl goes to guest_memfd; guest_memfd unmaps the pages and
>    checks refcounts. If there's anything unexpected, error out to
>    userspace. If all is well, flip shareability and return success to
>    userspace.
>
> 5. Userspace calls vcpu_run() again, the handler for
>    KVM_HC_MAP_GPA_RANGE will tell the guest that userspace was able to
>    fulfill guest request with hypercall.ret set to 0 and then the guest
>    will continue.
>
> 6. On the next fault guest_memfd will allow the private fault from the
>    guest.
>
>
> The flow you're proposing works too, with some changes, but it's
> probably okay for x86 to have a slightly different flow anyway: (I
> refactored the steps you outlined)
>
> > 1. Guest sends unshare hypercall to the hypervisor
>
> Same
>
> > 2. Hypervisor forwards request to KVM (gmem) (having done due diligence)
>
> For x86 IIUC hypervisor is the same as KVM, so there's no forwarding to KVM.
>
> > 3. KVM (gmem) performs an unmap_folio(), exits to userspace with
> >    KVM_EXIT_UNSHARE and all the information about the folio being unshared
>
> The KVM_EXIT_UNSHARE here would correspond to x86's
> KVM_HC_MAP_GPA_RANGE.
>
> Unmapping before exiting with KVM_EXIT_UNSHARE here might be a little
> premature since userspace may have to do some stuff before permitting
> the conversion. For example, the memory may be mapped into another
> userspace driver process, which needs to first be stopped.
>
> But no issue: as long as we don't flip shareability, if the host
> uses the memory, kvm_gmem_fault_shared() will just happen again,
> nullifying the unmapping.
>
> We could just shift the unmapping till after vcpu_run() is called
> again.
>
> > 4. Userspace will do necessary coordination in userspace, then do
> >    vcpu_run()
>
> There's another layer here, at least for x86, as to whether the
> coordination was successful. For x86's KVM_HC_MAP_GPA_RANGE, userspace
> can indicate a non-zero hypercall.ret for error.
>
> For unsuccessful coordinations, userspace sets hypercall.ret to an
> error and the vcpu_run() handler doesn't try the conversion. The guest
> is informed of the hypercall error and will figure it out.
>
> > 5. Successful coordination, case 1: vcpu_run() knows the last exit was
> >    KVM_EXIT_UNSHARE and will set state to PRIVATE
>
> For case 1, userspace will set hypercall.ret to 0, and guest_memfd
> will do the conversion, basically calling the same function that the
> ioctl calls within guest_memfd.
>
> > 5. Successful coordination, case 2, alternative 1: vcpu_run() knows
> >    the last exit was KVM_EXIT_UNSHARE
>
> Exit to userspace with KVM_EXIT_MEMORY_FAULT.
>
> > 5. Successful coordination, case 2, alternative 2: vcpu_run() knows
> >    the last exit was KVM_EXIT_UNSHARE
>
> Forward hypercall.ret == 0 to the guest. Since the conversion was not
> performed, the next fault will be mismatched and there will be a
> KVM_EXIT_MEMORY_FAULT.

So far so good. With regard to the flow: in the code that I had, all
the specific details were arm64- and even pKVM-specific. None of it
was baked into core KVM code since, of course, different architectures
and even different VM types will vary significantly. Arm CCA, for
example, is closer to TDX than it is to pKVM. Moreover, it was just a
hack at getting something reasonable that works, as a proof of
concept.

This is one of the reasons I'm not a fan of having a userspace IOCTL
as an additional required step in this protocol. KVM exits already
exist (*), and we need them anyway here. The flow above is VM-type
specific, and since much of it isn't exposed to the user, it's easy
(and likely) to change. Having an IOCTL and adding another step in the
process makes it more difficult to change things later.

(*) Try saying that ten times fast! Note: first word is exit, second
word is exist :)

> > Hi Vishal,
> >
> > On Tue, 20 May 2025 at 17:03, Vishal Annapurve <vannapurve@google.com> wrote:
> >>
> >> On Tue, May 20, 2025 at 7:34 AM Fuad Tabba <tabba@google.com> wrote:
> >> >
> >> > Hi Vishal,
> >> >
> >> > On Tue, 20 May 2025 at 15:11, Vishal Annapurve <vannapurve@google.com> wrote:
> >> > >
> >> > > On Tue, May 20, 2025 at 6:44 AM Fuad Tabba <tabba@google.com> wrote:
> >> > > >
> >> > > > Hi Vishal,
> >> > > >
> >> > > > On Tue, 20 May 2025 at 14:02, Vishal Annapurve <vannapurve@google.com> wrote:
> >> > > > >
> >> > > > > On Tue, May 20, 2025 at 2:23 AM Fuad Tabba <tabba@google.com> wrote:
> >> > > > > >
> >> > > > > > Hi Ackerley,
> >> > > > > >
> >> > > > > > On Thu, 15 May 2025 at 00:43, Ackerley Tng <ackerleytng@google.com> wrote:
> >> > > > > > >
> >> > > > > > > The two new guest_memfd ioctls KVM_GMEM_CONVERT_SHARED and
> >> > > > > > > KVM_GMEM_CONVERT_PRIVATE convert the requested memory ranges to shared
> >> > > > > > > and private respectively.
> >> > > > > >
> >> > > > > > I have a high level question about this particular patch and this
> >> > > > > > approach for conversion: why do we need IOCTLs to manage conversion
> >> > > > > > between private and shared?
> >> > > > > >
> >> > > > > > In the presentations I gave at LPC [1, 2], and in my latest patch
> >> > > > > > series that performs in-place conversion [3] and the associated (by
> >> > > > > > now outdated) state diagram [4], I didn't see the need to have a
> >> > > > > > userspace-facing interface to manage that. KVM has all the information
> >> > > > > > it needs to handle conversions, which are triggered by the guest. To
> >> > > > > > me this seems like it adds additional complexity, as well as a user
> >> > > > > > facing interface that we would need to maintain.
> >> > > > > >
> >> > > > > > There are various ways we could handle conversion without explicit
> >> > > > > > interference from userspace. What I had in mind is the following (as
> >> > > > > > an example, details can vary according to VM type). I will use the
> >> > > > > > case of conversion from shared to private because that is the more
> >> > > > > > complicated (interesting) case:
> >> > > > > >
> >> > > > > > - Guest issues a hypercall to request that a shared folio become private.
> >> > > > > >
> >> > > > > > - The hypervisor receives the call, and passes it to KVM.
> >> > > > > >
> >> > > > > > - KVM unmaps the folio from the guest stage-2 (EPT I think in x86
> >> > > > > > parlance), and unmaps it from the host. The host however, could still
> >> > > > > > have references (e.g., GUP).
> >> > > > > >
> >> > > > > > - KVM exits to the host (hypervisor call exit), with the information
> >> > > > > > that the folio has been unshared from it.
> >> > > > > >
> >> > > > > > - A well behaving host would now get rid of all of its references
> >> > > > > > (e.g., release GUPs), perform a VCPU run, and the guest continues
> >> > > > > > running as normal. I expect this to be the common case.
> >> > > > > >
> >> > > > > > But to handle the more interesting situation, let's say that the host
> >> > > > > > doesn't do it immediately, and for some reason it holds on to some
> >> > > > > > references to that folio.
> >> > > > > >
> >> > > > > > - Even if that's the case, the guest can still run *. If the guest
> >> > > > > > tries to access the folio, KVM detects that access when it tries to
> >> > > > > > fault it into the guest, sees that the host still has references to
> >> > > > > > that folio, and exits back to the host with a memory fault exit. At
> >> > > > > > this point, the VCPU that has tried to fault in that particular folio
> >> > > > > > cannot continue running as long as it cannot fault in that folio.
> >> > > > >
> >> > > > > Are you talking about the following scheme?
> >> > > > > 1) guest_memfd checks shareability on each get pfn and if there is a
> >> > > > > mismatch exit to the host.
> >> > > >
> >> > > > I think we are not really on the same page here (no pun intended :) ).
> >> > > > I'll try to answer your questions anyway...
> >> > > >
> >> > > > Which get_pfn? Are you referring to get_pfn when faulting the page
> >> > > > into the guest or into the host?
> >> > >
> >> > > I am referring to guest fault handling in KVM.
> >> > >
> >> > > >
> >> > > > > 2) host user space has to guess whether it's a pending refcount or
> >> > > > > whether it's an actual mismatch.
> >> > > >
> >> > > > No need to guess. VCPU run will let it know exactly why it's exiting.
> >> > > >
> >> > > > > 3) guest_memfd will maintain a third state
> >> > > > > "pending_private_conversion" or equivalent which will transition to
> >> > > > > private upon the last refcount drop of each page.
> >> > > > >
> >> > > > > If conversion is triggered by userspace (in case of pKVM, it will be
> >> > > > > triggered from within the KVM (?)):
> >> > > >
> >> > > > Why would conversion be triggered by userspace? As far as I know, it's
> >> > > > the guest that triggers the conversion.
> >> > > >
> >> > > > > * Conversion will just fail if there are extra refcounts and userspace
> >> > > > > can try to get rid of extra refcounts on the range while it has enough
> >> > > > > context without hitting any ambiguity with memory fault exit.
> >> > > > > * guest_memfd will not have to deal with this extra state from 3 above
> >> > > > > and overall guest_memfd conversion handling becomes relatively
> >> > > > > simpler.
> >> > > >
> >> > > > That's not really related. The extra state isn't necessary any more
> >> > > > once we agreed in the previous discussion that we will retry instead.
> >> > >
> >> > > Who is *we* here? Which entity will retry conversion?
> >> >
> >> > Userspace will re-attempt the VCPU run.
> >>
> >> Then KVM will have to keep track of the ranges that need conversion
> >> across exits. I think it's cleaner to let userspace make the decision
> >> and invoke conversion without carrying additional state in KVM about
> >> the guest request.
> >
> > I disagree. I think it's cleaner not to introduce a user interface,
> > and just to track the reason for the last exit, along with the
> > required additional data. KVM is already responsible for handling the
> > workflow; why delegate this last part to the VMM?
> >
>
> I believe Fuad's concern is the complexity of adding and maintaining
> another ioctl, as opposed to having vcpu_run() do the conversions.
>
> I think the two options are basically the same in that both are actually
> adding some form of user contract, just in different places.
>
> For the ioctl approach, in this RFCv2 I added an error_offset field so
> that userspace has a hint of where the conversion had an issue. The
> ioctl also returns errors to indicate what went wrong, like -EINVAL, or
> -ENOMEM if splitting the page required memory and there wasn't any, or
> if the kernel ran out of memory trying to update mappability.
>
> If we want to provide the same level of error information for the
> vcpu_run() approach, we should probably add error_offset to
> KVM_EXIT_MEMORY_FAULT so that on a conversion failure we could re-exit
> to userspace with more information about the error_offset.
>
>
> So what we're really comparing is two ways to perform the conversion:
> (1) via a direct ioctl, and (2) via vcpu_run().

That's exactly right.

> I think having a direct ioctl is cleaner because it doesn't involve
> vCPUs for a memory operation.
>
> Conceptually, the conversion is a memory operation belonging to memory
> in the guest_memfd. Hence, the conversion operation is better addressed
> directly to the memory via a direct ioctl.
>
> For this same reason, we didn't want to do the conversion via the
> KVM_SET_MEMORY_ATTRIBUTES ioctl. KVM_SET_MEMORY_ATTRIBUTES is an
> operation on KVM's view of guest_memfd, which is linked to, but not
> directly the same as, a memory operation.
>
> By having a direct ioctl instead of using KVM_SET_MEMORY_ATTRIBUTES,
> we avoid having a dependency where memslots must first be bound to
> guest_memfd for the conversion to work.
>
> When rebooting, the memslots may not yet be bound to the guest_memfd,
> but we want to reset the guest_memfds to private. If we use
> KVM_SET_MEMORY_ATTRIBUTES to convert, we'd be forced to first bind,
> then convert. With a direct ioctl, we don't have this restriction.
>
> If we do the conversion via vcpu_run(), we would be forced to handle
> conversions only through vcpu_run(), and only the guest could initiate
> a conversion.
>
> On a guest boot for TDX, the memory is assumed to be private. If we
> gave it memory set as shared, we'd just have a bunch of
> KVM_EXIT_MEMORY_FAULTs that slow down boot. Hence on a guest reboot, we
> will want to reset the guest memory to private.
>
> We could say the firmware should reset memory to private on guest
> reboot, but we can't force all guests to update firmware.

Here is where I disagree. I do think that it is the CoCo guest's
responsibility (and by guest I include its firmware) to fix its own
state after a reboot. How would the host even know that a guest is
rebooting if it's a CoCo guest?

Either the host doesn't (or cannot even) know that the guest is
rebooting, in which case I don't see how having an IOCTL would help.
Or somehow the host does know, e.g., via a hypercall that indicates
it, in which case we could have it so that for that type of VM, we
would reconvert its pages to private on a reboot.

Additionally, we could introduce range operations for
sharing/unsharing, to avoid having an exit for every one.

> >> >
> >> > > >
> >> > > > > Note that for x86 CoCo cases, memory conversion is already triggered
> >> > > > > by userspace using KVM ioctl, this series is proposing to use
> >> > > > > guest_memfd ioctl to do the same.
> >> > > >
> >> > > > The reason why for x86 CoCo cases conversion is already triggered by
> >> > > > userspace using KVM ioctl is that it has to, since shared memory and
> >> > > > private memory are two separate pages, and userspace needs to manage
> >> > > > that. Sharing memory in place removes the need for that.
> >> > >
> >> > > Userspace still needs to clean up memory usage before conversion is
> >> > > successful. e.g. remove IOMMU mappings for shared to private
> >> > > conversion. I would think that memory conversion should not succeed
> >> > > before all existing users let go of the guest_memfd pages for the
> >> > > range being converted.
> >> >
> >> > Yes. Userspace will know that it needs to do that on the VCPU exit,
> >> > which informs it of the guest's hypervisor request to unshare (convert
> >> > from shared to private) the page.
> >> >
> >> > > In x86 CoCo usecases, userspace can also decide to not allow
> >> > > conversion for scenarios where ranges are still under active use by
> >> > > the host and guest is erroneously trying to take away memory. Both
> >> > > SNP/TDX spec allow failure of conversion due to in use memory.
> >> >
> >> > How can the guest erroneously try to take away memory? If the guest
> >> > sends a hypervisor request asking for a conversion of memory that
> >> > doesn't belong to it, then I would expect the hypervisor to prevent
> >> > that.
> >>
> >> Marking a range as private effectively disallows the host from
> >> accessing those ranges -> so it is taking away memory.
> >
> > You said "erroneously" earlier. My question is, how can the guest
> > *erroneously* try to take away memory? This is the normal flow of
> > guest/host relations. The memory is the guest's: it decides when to
> > share it with the host, and it can take it away.
> >
>
> See above, it's not really erroneous as long as
> kvm_gmem_fault_shared() can still happen, since after unmapping, any
> host access will just fault the page in again.

I was confused by the word "erroneous", as you would expect that for a
CoCo guest, the host wouldn't (or shouldn't) know the intention behind
a CoCo guest's access. I would expect that erroneous guest accesses
would be handled by the hypervisor. But I think we're on the same page
now.

> >> >
> >> > I don't see how having an IOCTL to trigger the conversion is needed to
> >> > allow conversion failure. How is that different from userspace
> >> > ignoring or delaying releasing all references it has for the
> >> > conversion request?
> >> >
> >> > > >
> >> > > > This series isn't using the same ioctl, it's introducing new ones to
> >> > > > perform a task that as far as I can tell so far, KVM can handle by
> >> > > > itself.
> >> > >
> >> > > I would like to understand this better. How will KVM handle the
> >> > > conversion process for guest_memfd pages? Can you help walk an example
> >> > > sequence for shared to private conversion specifically around
> >> > > guest_memfd offset states?
> >> >
> >> > To make sure that we are discussing the same scenario: can you do the
> >> > same as well please --- walk me through an example sequence for shared
> >> > to private conversion specifically around guest_memfd offset states
> >> > With the IOCTLs involved?
> >> >
> >> > Here is an example that I have implemented and tested with pKVM. Note
> >> > that there are alternatives; the flow below is architecture or even
> >> > vm-type dependent. None of this is core KVM code and the
> >> > behaviour could vary.
> >> >
> >> >
> >> > Assuming the folio is shared with the host:
> >> >
> >> > Guest sends unshare hypercall to the hypervisor
> >> > Hypervisor forwards request to KVM (gmem) (having done due diligence)
> >> > KVM (gmem) performs an unmap_folio(), exits to userspace with
> >>
> >> For x86 CoCo VM usecases I was talking about, userspace would like to
> >> avoid unmap_mapping_range() on the range before it's safe to unshare
> >> the range.
> >
> > Why? There is no harm in userspace unmapping before the memory is
> > unshared. I don't see the problem with that.
> >
>
> Yes, no harm done, just possible remapping after unmapping.
>
> > You still haven't responded to my question from the previous email:
> > can you please return the favor and walk me through an example
> > sequence for shared to private conversion specifically around
> > guest_memfd offset states with the IOCTLs involved? :D
> >
>
> Right at the top :)

Thank you Ackerley!

Cheers,
/fuad

>
> > Thanks!
> > /fuad
> >
> >
> >> > KVM_EXIT_UNSHARE and all the information about the folio being
> >> > unshared
> >> >
> >> > Case 1:
> >> > Userspace removes any remaining references (GUPs, IOMMU Mappings etc...)
> >> > Userspace calls vcpu_run(): KVM (gmem) sees that there aren't any
> >> > references, sets state to PRIVATE
> >> >
> >> > Case 2 (alternative 1):
> >> > Userspace doesn't release its references
> >> > Userspace calls vcpu_run(): KVM (gmem) sees that there are still
> >> > references, exits back to userspace with KVM_EXIT_UNSHARE
> >> >
> >> > Case 2 (alternative 2):
> >> > Userspace doesn't release its references
> >> > Userspace calls vcpu_run(): KVM (gmem) sees that there are still
> >> > references, unmaps folio from guest, but allows it to run (until it
> >> > tries to fault in the folio)
> >> > Guest tries to fault in folio that still has reference, KVM does not
> >> > allow that (it sees that the folio is shared, and it doesn't fault in
> >> > shared folios to confidential guests)
> >> > KVM exits back to userspace with KVM_EXIT_UNSHARE
> >> >
> >> > As I mentioned, the alternatives above are _not_ set in core KVM code.
> >> > They can vary by architecture or VM type, depending on the policy,
> >> > support, etc..
> >> >
> >> > Now for your example please on how this would work with IOCTLs :)
> >> >
> >> > Thanks,
> >> > /fuad
> >> >
> >> > > >
> >> > > > >  - Allows not having to keep track of separate shared/private range
> >> > > > > information in KVM.
> >> > > >
> >> > > > This patch series is already tracking shared/private range information in KVM.
> >> > > >
> >> > > > >  - Simpler handling of the conversion process done per guest_memfd
> >> > > > > rather than for full range.
> >> > > > >      - Userspace can handle the rollback as needed, simplifying error
> >> > > > > handling in guest_memfd.
> >> > > > >  - guest_memfd is single source of truth and notifies the users of
> >> > > > > shareability change.
> >> > > > >      - e.g. IOMMU, userspace, KVM MMU all can be registered for
> >> > > > > getting notifications from guest_memfd directly and will get notified
> >> > > > > for invalidation upon shareability attribute updates.
> >> > > >
> >> > > > All of these can still be done without introducing a new ioctl.
> >> > > >
> >> > > > Cheers,
> >> > > > /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 7 months ago
On Wed, May 21, 2025 at 5:36 AM Fuad Tabba <tabba@google.com> wrote:
> ....
> > When rebooting, the memslots may not yet be bound to the guest_memfd,
> > but we want to reset the guest_memfd's to private. If we use
> > KVM_SET_MEMORY_ATTRIBUTES to convert, we'd be forced to first bind, then
> > convert. If we had a direct ioctl, we don't have this restriction.
> >
> > If we do the conversion via vcpu_run() we would be forced to handle
> > conversions only with a vcpu_run() and only the guest can initiate a
> > conversion.
> >
> > On a guest boot for TDX, the memory is assumed to be private. If we
> > gave it memory set as shared, we'd just have a bunch of
> > KVM_EXIT_MEMORY_FAULTs that slow down boot. Hence on a guest reboot, we
> > will want to reset the guest memory to private.
> >
> > We could say the firmware should reset memory to private on guest
> > reboot, but we can't force all guests to update firmware.
>
> Here is where I disagree. I do think that this is the CoCo guest's
> responsibility (and by guest I include its firmware) to fix its own
> state after a reboot. How would the host even know that a guest is
> rebooting if it's a CoCo guest?

There are a bunch of complexities here: the reboot sequence on x86 can
be triggered in multiple ways that I don't fully understand, but a few
of them include reading/writing a "reset register" in MMIO/PCI config
space that is emulated by host userspace directly. The host has to
know when the guest is shutting down to manage its lifecycle.

x86 CoCo VM firmwares don't support warm/soft reboot, and even if they
do in the future, the guest kernel can choose a different reboot
mechanism. So guest reboot needs to be emulated by always starting from
scratch. This sequence needs the initial guest firmware payload to be
installed into private ranges of guest_memfd.

>
> Either the host doesn't (or cannot even) know that the guest is
> rebooting, in which case I don't see how having an IOCTL would help.

Host does know that the guest is rebooting.

> Or somehow the host does know that, i.e., via a hypercall that
> indicates that. In which case, we could have it so that for that type
> of VM, we would reconvert its pages to private on a reboot.

This possibly could be solved by resetting the ranges to private when
binding with a memslot of a certain VM type. But then Google also has a
usecase to support intrahost migration, where a live VM and associated
guest_memfd files are bound to a new KVM VM and memslots.

Otherwise, we need an additional contract between userspace/KVM to
intercept/handle guest_memfd range reset.
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Vishal,

On Wed, 21 May 2025 at 15:42, Vishal Annapurve <vannapurve@google.com> wrote:
>
> On Wed, May 21, 2025 at 5:36 AM Fuad Tabba <tabba@google.com> wrote:
> > ....
> > > When rebooting, the memslots may not yet be bound to the guest_memfd,
> > > but we want to reset the guest_memfd's to private. If we use
> > > KVM_SET_MEMORY_ATTRIBUTES to convert, we'd be forced to first bind, then
> > > convert. If we had a direct ioctl, we don't have this restriction.
> > >
> > > If we do the conversion via vcpu_run() we would be forced to handle
> > > conversions only with a vcpu_run() and only the guest can initiate a
> > > conversion.
> > >
> > > On a guest boot for TDX, the memory is assumed to be private. If we
> > > gave it memory set as shared, we'd just have a bunch of
> > > KVM_EXIT_MEMORY_FAULTs that slow down boot. Hence on a guest reboot, we
> > > will want to reset the guest memory to private.
> > >
> > > We could say the firmware should reset memory to private on guest
> > > reboot, but we can't force all guests to update firmware.
> >
> > Here is where I disagree. I do think that this is the CoCo guest's
> > responsibility (and by guest I include its firmware) to fix its own
> > state after a reboot. How would the host even know that a guest is
> > rebooting if it's a CoCo guest?
>
> There are a bunch of complexities here: the reboot sequence on x86 can
> be triggered in multiple ways that I don't fully understand, but a few
> of them include reading/writing a "reset register" in MMIO/PCI config
> space that is emulated by host userspace directly. The host has to
> know when the guest is shutting down to manage its lifecycle.

In that case, I think we need to fully understand these complexities
before adding new IOCTLs. It could be that once we understand these
issues, we find that we don't need these IOCTLs. It's hard to justify
adding an IOCTL for something we don't understand.

> x86 CoCo VM firmwares don't support warm/soft reboot, and even if they
> do in the future, the guest kernel can choose a different reboot
> mechanism. So guest reboot needs to be emulated by always starting from
> scratch. This sequence needs the initial guest firmware payload to be
> installed into private ranges of guest_memfd.
>
> >
> > Either the host doesn't (or cannot even) know that the guest is
> > rebooting, in which case I don't see how having an IOCTL would help.
>
> Host does know that the guest is rebooting.

In that case, that (i.e., the host finding out that the guest is
rebooting) could trigger the conversion back to private. No need for
an IOCTL.

> > Or somehow the host does know that, i.e., via a hypercall that
> > indicates that. In which case, we could have it so that for that type
> > of VM, we would reconvert its pages to private on a reboot.
>
> This possibly could be solved by resetting the ranges to private when
> binding with a memslot of a certain VM type. But then Google also has a
> usecase to support intrahost migration, where a live VM and associated
> guest_memfd files are bound to a new KVM VM and memslots.
>
> Otherwise, we need an additional contract between userspace/KVM to
> intercept/handle guest_memfd range reset.

Then this becomes a migration issue to be solved then, not a huge page
support issue. If such IOCTLs are needed for migration, it's too early
to add them now.

Cheers,
/fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Vishal Annapurve 7 months ago
On Wed, May 21, 2025 at 8:22 AM Fuad Tabba <tabba@google.com> wrote:
>
> Hi Vishal,
>
> On Wed, 21 May 2025 at 15:42, Vishal Annapurve <vannapurve@google.com> wrote:
> >
> > On Wed, May 21, 2025 at 5:36 AM Fuad Tabba <tabba@google.com> wrote:
> > > ....
> > > > When rebooting, the memslots may not yet be bound to the guest_memfd,
> > > > but we want to reset the guest_memfd's to private. If we use
> > > > KVM_SET_MEMORY_ATTRIBUTES to convert, we'd be forced to first bind, then
> > > > convert. If we had a direct ioctl, we don't have this restriction.
> > > >
> > > > If we do the conversion via vcpu_run() we would be forced to handle
> > > > conversions only with a vcpu_run() and only the guest can initiate a
> > > > conversion.
> > > >
> > > > On a guest boot for TDX, the memory is assumed to be private. If we
> > > > gave it memory set as shared, we'd just have a bunch of
> > > > KVM_EXIT_MEMORY_FAULTs that slow down boot. Hence on a guest reboot, we
> > > > will want to reset the guest memory to private.
> > > >
> > > > We could say the firmware should reset memory to private on guest
> > > > reboot, but we can't force all guests to update firmware.
> > >
> > > Here is where I disagree. I do think that this is the CoCo guest's
> > > responsibility (and by guest I include its firmware) to fix its own
> > > state after a reboot. How would the host even know that a guest is
> > > rebooting if it's a CoCo guest?
> >
> > There are a bunch of complexities here: the reboot sequence on x86 can
> > be triggered in multiple ways that I don't fully understand, but a few
> > of them include reading/writing a "reset register" in MMIO/PCI config
> > space that is emulated by host userspace directly. The host has to
> > know when the guest is shutting down to manage its lifecycle.
>
> In that case, I think we need to fully understand these complexities
> before adding new IOCTLs. It could be that once we understand these
> issues, we find that we don't need these IOCTLs. It's hard to justify
> adding an IOCTL for something we don't understand.
>

I don't understand all the ways an x86 guest can trigger reboot, but I
do know that the x86 CoCo Linux guest kernel triggers reset using an
MMIO/PCI config register write that is emulated by host userspace.

> > x86 CoCo VM firmwares don't support warm/soft reboot, and even if they
> > do in the future, the guest kernel can choose a different reboot
> > mechanism. So guest reboot needs to be emulated by always starting from
> > scratch. This sequence needs the initial guest firmware payload to be
> > installed into private ranges of guest_memfd.
> >
> > >
> > > Either the host doesn't (or cannot even) know that the guest is
> > > rebooting, in which case I don't see how having an IOCTL would help.
> >
> > Host does know that the guest is rebooting.
>
> In that case, that (i.e., the host finding out that the guest is
> rebooting) could trigger the conversion back to private. No need for
> an IOCTL.

In the reboot scenarios, it's the host userspace finding out that the
guest kernel wants to reboot.

>
> > > Or somehow the host does know that, i.e., via a hypercall that
> > > indicates that. In which case, we could have it so that for that type
> > > of VM, we would reconvert its pages to private on a reboot.
> >
> > This possibly could be solved by resetting the ranges to private when
> > binding with a memslot of a certain VM type. But then Google also has a
> > usecase to support intrahost migration, where a live VM and associated
> > guest_memfd files are bound to a new KVM VM and memslots.
> >
> > Otherwise, we need an additional contract between userspace/KVM to
> > intercept/handle guest_memfd range reset.
>
> Then this becomes a migration issue to be solved then, not a huge page
> support issue. If such IOCTLs are needed for migration, it's too early
> to add them now.

The guest_memfd ioctl is not needed for migration but to change/reset
guest_memfd range attributes. I am saying that the migration usecase can
conflict with some ways we could reset guest_memfd range attributes
without adding a new IOCTL, since migration closely resembles the reboot
scenario: both can/need to reuse the same guest memory files, but one
needs to preserve guest memory state.

Reiterating my understanding here, the guest memfd ioctl can be used by
host userspace to -
1) Change guest memfd range attributes during memory conversion
     - This can be handled by KVM hypercall exits in theory as you are
suggesting, but Ackerley and I still think that this is a memory
operation that goes beyond vcpu scope and will involve interaction
with the IOMMU backend as well; it's cleaner to have a separate guest
memfd-specific ioctl for this operation, as the impact is even beyond
KVM.

2) Reset guest memfd range attributes during guest reboot to allow
reusing the same guest memfd files.
    - This helps reset the range state to private as needed, in line
with the initial shared/private configuration chosen at guest memfd
creation.
    - This also helps reconstitute all the huge pages that may have
gotten split during the runtime of the guest back to their original
state.
  This is a host-initiated request for guest memfd memory conversion
that we should not be overloading with other KVM interactions, in my
opinion. (A minimal sketch of this reset flow follows below.)
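
As a minimal sketch of that reset flow, assuming the uapi proposed in
this series (KVM_GMEM_CONVERT_PRIVATE and struct kvm_gmem_convert);
gmem_fd and guest_memfd_size are illustrative, and error handling is
reduced to err():

#include <err.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* On reboot emulation, reset the whole guest_memfd back to private
 * before reinstalling the firmware payload into the private ranges. */
static void reset_gmem_to_private(int gmem_fd, __u64 guest_memfd_size)
{
	struct kvm_gmem_convert conv = {
		.offset = 0,
		.size = guest_memfd_size,
	};

	if (ioctl(gmem_fd, KVM_GMEM_CONVERT_PRIVATE, &conv) < 0)
		err(1, "reset to private failed near offset %llu",
		    (unsigned long long)conv.error_offset);
}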

>
> Cheers,
> /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 7 months ago
Hi Vishal,

On Wed, 21 May 2025 at 16:51, Vishal Annapurve <vannapurve@google.com> wrote:
>
> On Wed, May 21, 2025 at 8:22 AM Fuad Tabba <tabba@google.com> wrote:
> >
> > Hi Vishal,
> >
> > On Wed, 21 May 2025 at 15:42, Vishal Annapurve <vannapurve@google.com> wrote:
> > >
> > > On Wed, May 21, 2025 at 5:36 AM Fuad Tabba <tabba@google.com> wrote:
> > > > ....
> > > > > When rebooting, the memslots may not yet be bound to the guest_memfd,
> > > > > but we want to reset the guest_memfd's to private. If we use
> > > > > KVM_SET_MEMORY_ATTRIBUTES to convert, we'd be forced to first bind, then
> > > > > convert. If we had a direct ioctl, we don't have this restriction.
> > > > >
> > > > > If we do the conversion via vcpu_run() we would be forced to handle
> > > > > conversions only with a vcpu_run() and only the guest can initiate a
> > > > > conversion.
> > > > >
> > > > > On a guest boot for TDX, the memory is assumed to be private. If we
> > > > > gave it memory set as shared, we'd just have a bunch of
> > > > > KVM_EXIT_MEMORY_FAULTs that slow down boot. Hence on a guest reboot, we
> > > > > will want to reset the guest memory to private.
> > > > >
> > > > > We could say the firmware should reset memory to private on guest
> > > > > reboot, but we can't force all guests to update firmware.
> > > >
> > > > Here is where I disagree. I do think that this is the CoCo guest's
> > > > responsibility (and by guest I include its firmware) to fix its own
> > > > state after a reboot. How would the host even know that a guest is
> > > > rebooting if it's a CoCo guest?
> > >
> > > There are a bunch of complexities here: the reboot sequence on x86 can
> > > be triggered in multiple ways that I don't fully understand, but a few
> > > of them include reading/writing a "reset register" in MMIO/PCI config
> > > space that is emulated by host userspace directly. The host has to
> > > know when the guest is shutting down to manage its lifecycle.
> >
> > In that case, I think we need to fully understand these complexities
> > before adding new IOCTLs. It could be that once we understand these
> > issues, we find that we don't need these IOCTLs. It's hard to justify
> > adding an IOCTL for something we don't understand.
> >
>
> I don't understand all the ways an x86 guest can trigger reboot, but I
> do know that the x86 CoCo Linux guest kernel triggers reset using an
> MMIO/PCI config register write that is emulated by host userspace.
>
> > > x86 CoCo VM firmwares don't support warm/soft reboot, and even if they
> > > do in the future, the guest kernel can choose a different reboot
> > > mechanism. So guest reboot needs to be emulated by always starting from
> > > scratch. This sequence needs the initial guest firmware payload to be
> > > installed into private ranges of guest_memfd.
> > >
> > > >
> > > > Either the host doesn't (or cannot even) know that the guest is
> > > > rebooting, in which case I don't see how having an IOCTL would help.
> > >
> > > Host does know that the guest is rebooting.
> >
> > In that case, that (i.e., the host finding out that the guest is
> > rebooting) could trigger the conversion back to private. No need for
> > an IOCTL.
>
> In the reboot scenarios, it's the host userspace finding out that the
> guest kernel wants to reboot.

How does the host userspace find that out? If the host userspace is
capable of finding that out, then surely KVM is also capable of
finding out the same.


> >
> > > > Or somehow the host does know that, i.e., via a hypercall that
> > > > indicates that. In which case, we could have it so that for that type
> > > > of VM, we would reconvert its pages to private on a reboot.
> > >
> > > This possibly could be solved by resetting the ranges to private when
> > > binding with a memslot of a certain VM type. But then Google also has a
> > > usecase to support intrahost migration, where a live VM and associated
> > > guest_memfd files are bound to a new KVM VM and memslots.
> > >
> > > Otherwise, we need an additional contract between userspace/KVM to
> > > intercept/handle guest_memfd range reset.
> >
> > Then this becomes a migration issue to be solved then, not a huge page
> > support issue. If such IOCTLs are needed for migration, it's too early
> > to add them now.
>
> The guest_memfd ioctl is not needed for migration but to change/reset
> guest_memfd range attributes. I am saying that the migration usecase can
> conflict with some ways we could reset guest_memfd range attributes
> without adding a new IOCTL, since migration closely resembles the reboot
> scenario: both can/need to reuse the same guest memory files, but one
> needs to preserve guest memory state.
>
> Reiterating my understanding here, the guest memfd ioctl can be used by
> host userspace to -
> 1) Change guest memfd range attributes during memory conversion
>      - This can be handled by KVM hypercall exits in theory as you are
> suggesting, but Ackerley and I still think that this is a memory
> operation that goes beyond vcpu scope and will involve interaction
> with the IOMMU backend as well; it's cleaner to have a separate guest
> memfd-specific ioctl for this operation, as the impact is even beyond
> KVM.

The IOMMU backend needs to know about the sharing/unsharing, not
trigger it. The memory is the guest's. We already have a mechanism for
informing userspace of these kinds of events with KVM exits. This
doesn't justify adding a new IOCTL.

> 2) Reset guest memfd range attributes during guest reboot to allow
> reusing the same guest memfd files.
>     - This helps reset the range state to private as needed, in line
> with the initial shared/private configuration chosen at guest memfd
> creation.
>     - This also helps reconstitute all the huge pages that may have
> gotten split during the runtime of the guest back to their original
> state.
>   This is a host-initiated request for guest memfd memory conversion
> that we should not be overloading with other KVM interactions, in my
> opinion.

Then, we could argue about whether we need a "reset" IOCTL (not that I
am arguing for that). But still, like I said, if the host becomes
aware that the confidential guest is rebooting, then surely KVM can be
made aware.

I wonder if this might be better suited for the biweekly guest_memfd sync.

Cheers,
/fuad
> >
> > Cheers,
> > /fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Sean Christopherson 6 months, 4 weeks ago
On Wed, May 21, 2025, Fuad Tabba wrote:
> On Wed, 21 May 2025 at 16:51, Vishal Annapurve <vannapurve@google.com> wrote:
> > On Wed, May 21, 2025 at 8:22 AM Fuad Tabba <tabba@google.com> wrote:
> > > On Wed, 21 May 2025 at 15:42, Vishal Annapurve <vannapurve@google.com> wrote:
> > > > On Wed, May 21, 2025 at 5:36 AM Fuad Tabba <tabba@google.com> wrote:
> > > > There are a bunch of complexities here: the reboot sequence on x86 can
> > > > be triggered in multiple ways that I don't fully understand, but a few
> > > > of them include reading/writing a "reset register" in MMIO/PCI config
> > > > space that is emulated by host userspace directly. The host has to
> > > > know when the guest is shutting down to manage its lifecycle.
> > >
> > > In that case, I think we need to fully understand these complexities
> > > before adding new IOCTLs. It could be that once we understand these
> > > issues, we find that we don't need these IOCTLs. It's hard to justify
> > > adding an IOCTL for something we don't understand.
> > >
> >
> > I don't understand all the ways an x86 guest can trigger reboot, but I
> > do know that the x86 CoCo Linux guest kernel triggers reset using an
> > MMIO/PCI config register write that is emulated by host userspace.
> >
> > > > x86 CoCo VM firmwares don't support warm/soft reboot, and even if they
> > > > do in the future, the guest kernel can choose a different reboot
> > > > mechanism. So guest reboot needs to be emulated by always starting from
> > > > scratch. This sequence needs the initial guest firmware payload to be
> > > > installed into private ranges of guest_memfd.
> > > >
> > > > >
> > > > > Either the host doesn't (or cannot even) know that the guest is
> > > > > rebooting, in which case I don't see how having an IOCTL would help.
> > > >
> > > > Host does know that the guest is rebooting.
> > >
> > > In that case, that (i.e., the host finding out that the guest is
> > > rebooting) could trigger the conversion back to private. No need for an
> > > IOCTL.
> >
> > In the reboot scenarios, it's the host userspace finding out that the guest
> > kernel wants to reboot.
> 
> How does the host userspace find that out? If the host userspace is capable
> of finding that out, then surely KVM is also capable of finding out the same.

Nope, not on x86.  Well, not without userspace invoking a new ioctl, which would
defeat the purpose of adding these ioctls.

KVM is only responsible for emulating/virtualizing the "CPU".  The chipset, e.g.
the PCI config space, is fully owned by userspace.  KVM doesn't even know whether
or not PCI exists for the VM.  And reboot may be emulated by simply creating a
new KVM instance, i.e. even if KVM was somehow aware of the reboot request, the
change in state would happen in an entirely new struct kvm.

That said, Vishal and Ackerley, this patch is a bit lacking on the documentation
front.  The changelog asserts that:

  A guest_memfd ioctl is used because shareability is a property of the memory,
  and this property should be modifiable independently of the attached struct kvm

but then follows with a very weak and IMO largely irrelevant justification of:

  This allows shareability to be modified even if the memory is not yet bound
  using memslots.

Allowing userspace to change shareability without memslots is one relatively minor
flow in one very specific use case.

The real justification for these ioctls is that fundamentally, shareability for
in-place conversions is a property of a guest_memfd instance and not a struct kvm
instance, and so needs to be owned by guest_memfd.

I.e. focus on justifying the change from a design and conceptual perspective,
not from a mechanical perspective of a flow that's likely somewhat unique to our
specific environment.  Y'all are getting deep into the weeds on a random aspect
of x86 platform architecture, instead of focusing on the overall design.

The other issue that's likely making this more confusing than it needs to be is
that this series is actually two completely different series bundled into one,
with very little explanation.  Moving shared vs. private ownership into
guest_memfd isn't a requirement for 1GiB support, it's a requirement for in-place
shared/private conversion in guest_memfd.

For the current guest_memfd implementation, shared vs. private is tracked in the
VM via memory attributes, because a guest_memfd instance is *only* private.  I.e.
shared vs. private is a property of the VM, not of the guest_memfd instance.  But
when in-place conversion support comes along, ownership of that particular
attribute needs to shift to the guest_memfd instance.

I know I gave feedback on an earlier posting about there being too many
series flying around, but shoving two distinct concepts into a single
series is not the answer.  My complaints about too much noise weren't
that there were multiple series, it was that there was very little
coordination and lots of chaos.

If you split this series in two, which should be trivial since you've already
organized the patches as a split, then sans the selftests (thank you for those!),
in-place conversion support will be its own (much smaller!) series that can focus
on that specific aspect of the design, and can provide a cover letter that
expounds on the design goals and uAPI.

  KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
  KVM: Query guest_memfd for private/shared status
  KVM: guest_memfd: Skip LRU for guest_memfd folios
  KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
  KVM: guest_memfd: Introduce and use shareability to guard faulting
  KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes

And then you can post the 1GiB series separately.  So long as you provide pointers
to dependencies along with a link to a repo+branch with the kitchen sink, I won't
complain about things being too chaotic :-)
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 6 months, 4 weeks ago
Hi Sean,

On Thu, 22 May 2025 at 15:52, Sean Christopherson <seanjc@google.com> wrote:
>
> On Wed, May 21, 2025, Fuad Tabba wrote:
> > On Wed, 21 May 2025 at 16:51, Vishal Annapurve <vannapurve@google.com> wrote:
> > > On Wed, May 21, 2025 at 8:22 AM Fuad Tabba <tabba@google.com> wrote:
> > > > On Wed, 21 May 2025 at 15:42, Vishal Annapurve <vannapurve@google.com> wrote:
> > > > > On Wed, May 21, 2025 at 5:36 AM Fuad Tabba <tabba@google.com> wrote:
> > > > > There are a bunch of complexities here: the reboot sequence on x86 can
> > > > > be triggered in multiple ways that I don't fully understand, but a few
> > > > > of them include reading/writing a "reset register" in MMIO/PCI config
> > > > > space that is emulated by host userspace directly. The host has to
> > > > > know when the guest is shutting down to manage its lifecycle.
> > > >
> > > > In that case, I think we need to fully understand these complexities
> > > > before adding new IOCTLs. It could be that once we understand these
> > > > issues, we find that we don't need these IOCTLs. It's hard to justify
> > > > adding an IOCTL for something we don't understand.
> > > >
> > >
> > > I don't understand all the ways an x86 guest can trigger reboot, but I
> > > do know that the x86 CoCo Linux guest kernel triggers reset using an
> > > MMIO/PCI config register write that is emulated by host userspace.
> > >
> > > > > x86 CoCo VM firmwares don't support warm/soft reboot, and even if they
> > > > > do in the future, the guest kernel can choose a different reboot
> > > > > mechanism. So guest reboot needs to be emulated by always starting from
> > > > > scratch. This sequence needs the initial guest firmware payload to be
> > > > > installed into private ranges of guest_memfd.
> > > > >
> > > > > >
> > > > > > Either the host doesn't (or cannot even) know that the guest is
> > > > > > rebooting, in which case I don't see how having an IOCTL would help.
> > > > >
> > > > > Host does know that the guest is rebooting.
> > > >
> > > > In that case, that (i.e., the host finding out that the guest is
> > > > rebooting) could trigger the conversion back to private. No need for an
> > > > IOCTL.
> > >
> > > In the reboot scenarios, it's the host userspace finding out that the guest
> > > kernel wants to reboot.
> >
> > How does the host userspace find that out? If the host userspace is capable
> > of finding that out, then surely KVM is also capable of finding out the same.
>
> Nope, not on x86.  Well, not without userspace invoking a new ioctl, which would
> defeat the purpose of adding these ioctls.
>
> KVM is only responsible for emulating/virtualizing the "CPU".  The chipset, e.g.
> the PCI config space, is fully owned by userspace.  KVM doesn't even know whether
> or not PCI exists for the VM.  And reboot may be emulated by simply creating a
> new KVM instance, i.e. even if KVM was somehow aware of the reboot request, the
> change in state would happen in an entirely new struct kvm.
>
> That said, Vishal and Ackerley, this patch is a bit lacking on the documentation
> front.  The changelog asserts that:
>
>   A guest_memfd ioctl is used because shareability is a property of the memory,
>   and this property should be modifiable independently of the attached struct kvm
>
> but then follows with a very weak and IMO largely irrelevant justification of:
>
>   This allows shareability to be modified even if the memory is not yet bound
>   using memslots.
>
> Allowing userspace to change shareability without memslots is one relatively minor
> flow in one very specific use case.
>
> The real justification for these ioctls is that fundamentally, shareability for
> in-place conversions is a property of a guest_memfd instance and not a struct kvm
> instance, and so needs to be owned by guest_memfd.

Thanks for the clarification Sean. I have a couple of followup
questions/comments that you might be able to help with:

From a conceptual point of view, I understand that the in-place
conversion is a property of guest_memfd. But that doesn't necessarily
mean that the interface between kvm <-> guest_memfd is a userspace
IOCTL. We already communicate directly between the two. Other, even
less related subsystems within the kernel also interact without going
through userspace. Why can't we do the same here? I'm not suggesting
it not be owned by guest_memfd, but that we communicate directly.

From a performance point of view, I would expect the common case to be
that when KVM gets an unshare request from the guest, it would be able
to unmap those pages from the (cooperative) host userspace, and return
back to the guest. In this scenario, the host userspace wouldn't even
need to be involved. Having a userspace IOCTL as part of this makes
that trip unnecessarily longer for the common case.

Cheers,
/fuad

> I.e. focus on justifying the change from a design and conceptual perspective,
> not from a mechanical perspective of a flow that's likely somewhat unique to our
> specific environment.  Y'all are getting deep into the weeds on a random aspect
> of x86 platform architecture, instead of focusing on the overall design.
>
> The other issue that's likely making this more confusing than it needs to be is
> that this series is actually two completely different series bundled into one,
> with very little explanation.  Moving shared vs. private ownership into
> guest_memfd isn't a requirement for 1GiB support, it's a requirement for in-place
> shared/private conversion in guest_memfd.
>
> For the current guest_memfd implementation, shared vs. private is tracked in the
> VM via memory attributes, because a guest_memfd instance is *only* private.  I.e.
> shared vs. private is a property of the VM, not of the guest_memfd instance.  But
> when in-place conversion support comes along, ownership of that particular
> attribute needs to shift to the guest_memfd instance.
>
> I know I gave feedback on an earlier posting about there being too many
> series flying around, but shoving two distinct concepts into a single
> series is not the answer.  My complaints about too much noise weren't
> that there were multiple series, it was that there was very little
> coordination and lots of chaos.
>
> If you split this series in two, which should be trivial since you've already
> organized the patches as a split, then sans the selftests (thank you for those!),
> in-place conversion support will be its own (much smaller!) series that can focus
> on that specific aspect of the design, and can provide a cover letter that
> expounds on the design goals and uAPI.
>
>   KVM: guest_memfd: Add CAP KVM_CAP_GMEM_CONVERSION
>   KVM: Query guest_memfd for private/shared status
>   KVM: guest_memfd: Skip LRU for guest_memfd folios
>   KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
>   KVM: guest_memfd: Introduce and use shareability to guard faulting
>   KVM: guest_memfd: Make guest mem use guest mem inodes instead of anonymous inodes
>
> And then you can post the 1GiB series separately.  So long as you provide pointers
> to dependencies along with a link to a repo+branch with the kitchen sink, I won't
> complain about things being too chaotic :-)
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Sean Christopherson 6 months, 4 weeks ago
On Thu, May 22, 2025, Fuad Tabba wrote:
> On Thu, 22 May 2025 at 15:52, Sean Christopherson <seanjc@google.com> wrote:
> > On Wed, May 21, 2025, Fuad Tabba wrote:
> > > How does the host userspace find that out? If the host userspace is capable
> > > of finding that out, then surely KVM is also capable of finding out the same.
> >
> > Nope, not on x86.  Well, not without userspace invoking a new ioctl, which would
> > defeat the purpose of adding these ioctls.
> >
> > KVM is only responsible for emulating/virtualizing the "CPU".  The chipset, e.g.
> > the PCI config space, is fully owned by userspace.  KVM doesn't even know whether
> > or not PCI exists for the VM.  And reboot may be emulated by simply creating a
> > new KVM instance, i.e. even if KVM was somehow aware of the reboot request, the
> > change in state would happen in an entirely new struct kvm.
> >
> > That said, Vishal and Ackerley, this patch is a bit lacking on the documentation
> > front.  The changelog asserts that:
> >
> >   A guest_memfd ioctl is used because shareability is a property of the memory,
> >   and this property should be modifiable independently of the attached struct kvm
> >
> > but then follows with a very weak and IMO largely irrelevant justification of:
> >
> >   This allows shareability to be modified even if the memory is not yet bound
> >   using memslots.
> >
> > Allowing userspace to change shareability without memslots is one relatively minor
> > flow in one very specific use case.
> >
> > The real justification for these ioctls is that fundamentally, shareability for
> > in-place conversions is a property of a guest_memfd instance and not a struct kvm
> > instance, and so needs to be owned by guest_memfd.
> 
> Thanks for the clarification Sean. I have a couple of followup
> questions/comments that you might be able to help with:
> 
> From a conceptual point of view, I understand that the in-place conversion is
> a property of guest_memfd. But that doesn't necessarily mean that the
> interface between kvm <-> guest_memfd is a userspace IOCTL.

kvm and guest_memfd aren't the communication endpoints for in-place conversions,
and more importantly, kvm isn't part of the control plane.  kvm's primary role
(for guest_memfd with in-place conversions) is to manage the page tables to map
memory into the guest.

kvm *may* also explicitly provide a communication channel between the guest and
host, e.g. when conversions are initiated via hypercalls, but in some cases the
communication channel may be created through pre-existing mechanisms, e.g. a
shared memory buffer or emulated I/O (such as the PCI reset case).

  guest => kvm (dumb pipe) => userspace => guest_memfd => kvm (invalidate)

And in other cases, kvm might not be in that part of the picture at all, e.g. if
the userspace VMM provides an interface to the VM owner (which could also be the
user running the VM) to reset the VM, then the flow would look like:

  userspace => guest_memfd => kvm (invalidate)
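
As a concrete (if simplified) sketch of the userspace leg of the first
flow, assuming the KVM_GMEM_CONVERT_SHARED ioctl and struct
kvm_gmem_convert proposed in this series; policy_allows_share() and
gpa_to_gmem_offset() are hypothetical VMM-side helpers:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical VMM helpers standing in for policy and memslot lookup. */
bool policy_allows_share(__u64 gpa, __u64 size);
__u64 gpa_to_gmem_offset(__u64 gpa);

/* Invoked after the "dumb pipe" exit reports a guest share request. */
static int handle_share_request(int gmem_fd, __u64 gpa, __u64 size)
{
	struct kvm_gmem_convert conv = {
		.offset = gpa_to_gmem_offset(gpa),
		.size = size,
	};

	/* Userspace, not KVM, decides whether to honor the request. */
	if (!policy_allows_share(gpa, size))
		return -1;

	/* guest_memfd flips shareability and invalidates KVM's mappings. */
	return ioctl(gmem_fd, KVM_GMEM_CONVERT_SHARED, &conv);
}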

A decent comparison is vCPUs.  KVM _could_ route all ioctls through the VM, but
that's unpleasant for all parties, as it'd be cumbersome for userspace, and
unnecessarily complex and messy for KVM.  Similarly, routing guest_memfd state
changes through KVM_SET_MEMORY_ATTRIBUTES is awkward from both design and mechanical
perspectives.

Even if we disagree on how ugly/pretty routing conversions through kvm would be,
which I'll allow is subjective, the bigger problem is that bouncing through
KVM_SET_MEMORY_ATTRIBUTES would create an unholy mess of an ABI.

Today, KVM_SET_MEMORY_ATTRIBUTES is handled entirely within kvm, and any changes
take effect irrespective of any memslot bindings.  And that didn't happen by
chance; preserving and enforcing attribute changes independently of memslots was
a key design requirement, precisely because memslots are ephemeral to a certain
extent.

Adding support for in-place guest_memfd conversion will require new ABI, and so
will be a "breaking" change for KVM_SET_MEMORY_ATTRIBUTES no matter what.  E.g.
KVM will need to reject KVM_MEMORY_ATTRIBUTE_PRIVATE for VMs that elect to use
in-place guest_memfd conversions.  But very critically, KVM can crisply enumerate
the lack of KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_CAP_MEMORY_ATTRIBUTES, the
behavior will be very straightforward to document (e.g. CAP X is mutually exclusive
with KVM_MEMORY_ATTRIBUTE_PRIVATE), and it will be opt-in, i.e. won't truly be a
breaking change.
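
For what it's worth, KVM_CHECK_EXTENSION on KVM_CAP_MEMORY_ATTRIBUTES
already returns the mask of supported attributes, so userspace could
probe the scheme above with something like this sketch:

#include <stdbool.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns true if the VM still exposes attribute-based private memory;
 * per the scheme above, a VM opted into in-place guest_memfd conversion
 * would no longer advertise KVM_MEMORY_ATTRIBUTE_PRIVATE here. */
static bool vm_has_attribute_based_private(int vm_fd)
{
	int attrs = ioctl(vm_fd, KVM_CHECK_EXTENSION,
			  KVM_CAP_MEMORY_ATTRIBUTES);

	return attrs > 0 && (attrs & KVM_MEMORY_ATTRIBUTE_PRIVATE);
}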

If/when we move shareability to guest_memfd, routing state changes through
KVM_SET_MEMORY_ATTRIBUTES will gain a subtle dependency on userspace having to
create memslots in order for state changes to take effect.  That wrinkle would be
weird and annoying to document, e.g. "if CAP X is enabled, the ioctl ordering is
A => B => C, otherwise the ordering doesn't matter", and would create many more
conundrums:

  - If a memslot needs to exist in order for KVM_SET_MEMORY_ATTRIBUTES to take effect,
    what should happen if that memslot is deleted?
  - If a memslot isn't found, should KVM_SET_MEMORY_ATTRIBUTES fail and report
    an error, or silently do nothing?
  - If KVM_SET_MEMORY_ATTRIBUTES affects multiple memslots that are bound to
    multiple guest_memfd, how does KVM guarantee atomicity?  What happens if one
    guest_memfd conversion succeeds, but a later one fails?

> We already communicate directly between the two. Other, even less related
> subsystems within the kernel also interact without going through userspace.
> Why can't we do the same here? I'm not suggesting it not be owned by
> guest_memfd, but that we communicate directly.

I'm not concerned about kvm communicating with guest_memfd, as you note it's all
KVM.  As above, my concerns are all about KVM's ABI and who owns/controls what.

> From a performance point of view, I would expect the common case to be that
> when KVM gets an unshare request from the guest, it would be able to unmap
> those pages from the (cooperative) host userspace, and return back to the
> guest. In this scenario, the host userspace wouldn't even need to be
> involved.

Hard NAK, at least from an x86 perspective.  Userspace is the sole decision maker
with respect to whether memory is in a shared or private state, full stop.  The guest
can make *requests* to convert memory, but ultimately it's host userspace that
decides whether or not to honor the request.

We've litigated this exact issue multiple times.  All state changes must be
controlled by userspace, because userspace is the only entity that can gracefully
handle exceptions and edge cases, and is the only entity with (almost) full
knowledge of the system.  We can discuss this again if necessary, but I'd much
prefer to not rehash all of those conversations.

> Having a userspace IOCTL as part of this makes that trip unnecessarily longer
> for the common case.

I'm very skeptical that an exit to userspace is going to even be measurable in
terms of the cost to convert memory.  Conversion is going to require multiple
locks, modifications to multiple sets of page tables with all the associated TLB
maintenance, possibly cache maintenance, and probably a few other things I'm
forgetting.  The cost of a few user<=>kernel transitions is likely going to be a
drop in the bucket.

If I'm wrong, and there are flows where the user<=>kernel transitions are the
long pole, then we could certainly explore adding a way for userspace to opt
into a "fast path" conversion.  But it would need to be exactly that, an optional
fast path that can fall back to the "slow" userspace-driven conversion as needed.
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Fuad Tabba 6 months, 4 weeks ago
Hi Sean,


On Thu, 22 May 2025 at 17:26, Sean Christopherson <seanjc@google.com> wrote:
>
> On Thu, May 22, 2025, Fuad Tabba wrote:
> > On Thu, 22 May 2025 at 15:52, Sean Christopherson <seanjc@google.com> wrote:
> > > On Wed, May 21, 2025, Fuad Tabba wrote:
> > > > How does the host userspace find that out? If the host userspace is capable
> > > > of finding that out, then surely KVM is also capable of finding out the same.
> > >
> > > Nope, not on x86.  Well, not without userspace invoking a new ioctl, which would
> > > defeat the purpose of adding these ioctls.
> > >
> > > KVM is only responsible for emulating/virtualizing the "CPU".  The chipset, e.g.
> > > the PCI config space, is fully owned by userspace.  KVM doesn't even know whether
> > > or not PCI exists for the VM.  And reboot may be emulated by simply creating a
> > > new KVM instance, i.e. even if KVM was somehow aware of the reboot request, the
> > > change in state would happen in an entirely new struct kvm.
> > >
> > > That said, Vishal and Ackerley, this patch is a bit lacking on the documentation
> > > front.  The changelog asserts that:
> > >
> > >   A guest_memfd ioctl is used because shareability is a property of the memory,
> > >   and this property should be modifiable independently of the attached struct kvm
> > >
> > > but then follows with a very weak and IMO largely irrelevant justification of:
> > >
> > >   This allows shareability to be modified even if the memory is not yet bound
> > >   using memslots.
> > >
> > > Allowing userspace to change shareability without memslots is one relatively minor
> > > flow in one very specific use case.
> > >
> > > The real justification for these ioctls is that fundamentally, shareability for
> > > in-place conversions is a property of a guest_memfd instance and not a struct kvm
> > > instance, and so needs to be owned by guest_memfd.
> >
> > Thanks for the clarification Sean. I have a couple of followup
> > questions/comments that you might be able to help with:
> >
> > From a conceptual point of view, I understand that the in-place conversion is
> > a property of guest_memfd. But that doesn't necessarily mean that the
> > interface between kvm <-> guest_memfd is a userspace IOCTL.
>
> kvm and guest_memfd aren't the communication endpoints for in-place conversions,
> and more importantly, kvm isn't part of the control plane.  kvm's primary role
> (for guest_memfd with in-place conversions) is to manage the page tables to map
> memory into the guest.
>
> kvm *may* also explicitly provide a communication channel between the guest and
> host, e.g. when conversions are initiated via hypercalls, but in some cases the
> communication channel may be created through pre-existing mechanisms, e.g. a
> shared memory buffer or emulated I/O (such as the PCI reset case).
>
>   guest => kvm (dumb pipe) => userspace => guest_memfd => kvm (invalidate)
>
> And in other cases, kvm might not be in that part of the picture at all, e.g. if
> the userspace VMM provides an interface to the VM owner (which could also be the
> user running the VM) to reset the VM, then the flow would look like:
>
>   userspace => guest_memfd => kvm (invalidate)
>
> A decent comparison is vCPUs.  KVM _could_ route all ioctls through the VM, but
> that's unpleasant for all parties, as it'd be cumbersome for userspace, and
> unnecessarily complex and messy for KVM.  Similarly, routing guest_memfd state
> changes through KVM_SET_MEMORY_ATTRIBUTES is awkward from both design and mechanical
> perspectives.
>
> Even if we disagree on how ugly/pretty routing conversions through kvm would be,
> which I'll allow is subjective, the bigger problem is that bouncing through
> KVM_SET_MEMORY_ATTRIBUTES would create an unholy mess of an ABI.
>
> Today, KVM_SET_MEMORY_ATTRIBUTES is handled entirely within kvm, and any changes
> take effect irrespective of any memslot bindings.  And that didn't happen by
> chance; preserving and enforcing attribute changes independently of memslots was
> a key design requirement, precisely because memslots are ephemeral to a certain
> extent.
>
> Adding support for in-place guest_memfd conversion will require new ABI, and so
> will be a "breaking" change for KVM_SET_MEMORY_ATTRIBUTES no matter what.  E.g.
> KVM will need to reject KVM_MEMORY_ATTRIBUTE_PRIVATE for VMs that elect to use
> in-place guest_memfd conversions.  But very critically, KVM can crisply enumerate
> the lack of KVM_MEMORY_ATTRIBUTE_PRIVATE via KVM_CAP_MEMORY_ATTRIBUTES, the
> behavior will be very straightforward to document (e.g. CAP X is mutually exclusive
> with KVM_MEMORY_ATTRIBUTE_PRIVATE), and it will be opt-in, i.e. won't truly be a
> breaking change.
>
> If/when we move shareability to guest_memfd, routing state changes through
> KVM_SET_MEMORY_ATTRIBUTES will gain a subtle dependency on userspace having to
> create memslots in order for state changes to take effect.  That wrinkle would be
> weird and annoying to document, e.g. "if CAP X is enabled, the ioctl ordering is
> A => B => C, otherwise the ordering doesn't matter", and would create many more
> conundrums:
>
>   - If a memslot needs to exist in order for KVM_SET_MEMORY_ATTRIBUTES to take effect,
>     what should happen if that memslot is deleted?
>   - If a memslot isn't found, should KVM_SET_MEMORY_ATTRIBUTES fail and report
>     an error, or silently do nothing?
>   - If KVM_SET_MEMORY_ATTRIBUTES affects multiple memslots that are bound to
>     multiple guest_memfd, how does KVM guarantee atomicity?  What happens if one
>     guest_memfd conversion succeeds, but a later fails?
>
> > We already communicate directly between the two. Other, even less related
> > subsystems within the kernel also interact without going through userspace.
> > Why can't we do the same here? I'm not suggesting it not be owned by
> > guest_memfd, but that we communicate directly.
>
> I'm not concerned about kvm communicating with guest_memfd, as you note it's all
> KVM.  As above, my concerns are all about KVM's ABI and who owns/controls what.
>
> > From a performance point of view, I would expect the common case to be that
> > when KVM gets an unshare request from the guest, it would be able to unmap
> > those pages from the (cooperative) host userspace, and return back to the
> > guest. In this scenario, the host userspace wouldn't even need to be
> > involved.
>
> Hard NAK, at least from an x86 perspective.  Userspace is the sole decision maker
> with respect to whether memory is in a shared or private state, full stop.  The guest
> can make *requests* to convert memory, but ultimately it's host userspace that
> decides whether or not to honor the request.
>
> We've litigated this exact issue multiple times.  All state changes must be
> controlled by userspace, because userspace is the only entity that can gracefully
> handle exceptions and edge cases, and is the only entity with (almost) full
> knowledge of the system.  We can discuss this again if necessary, but I'd much
> prefer to not rehash all of those conversations.
>
> > Having a userspace IOCTL as part of this makes that trip unnecessarily longer
> > for the common case.
>
> I'm very skeptical that an exit to userspace is going to even be measurable in
> terms of the cost to convert memory.  Conversion is going to require multiple
> locks, modifications to multiple sets of page tables with all the associated TLB
> maintenance, possibly cache maintenance, and probably a few other things I'm
> forgetting.  The cost of a few user<=>kernel transitions is likely going to be a
> drop in the bucket.
>
> If I'm wrong, and there are flows where the user<=>kernel transitions are the
> long pole, then we could certainly explore adding a way for userspace to opt
> into a "fast path" conversion.  But it would need to be exactly that, an optional
> fast path that can fall back to the "slow" userspace-driven conversion as needed.

Thanks for this very thorough explanation. I know that we have
litigated this issue, but not this _exact_ issue. My understanding was
that the main reason for using IOCTLs for memory attributes is that
userspace needs to manage private and shared memory separately,
including allocation and punching holes where necessary.

That said, no need to discuss this again. If it turns out that
user<->kernel transitions are a bottleneck we could look into an
opt-in fast path as you said.

Cheers,
/fuad
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ira Weiny 7 months ago
Ackerley Tng wrote:

[snip]

> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> 

[snip]

> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 590932499eba..f802116290ce 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -30,6 +30,10 @@ enum shareability {
>  };
>  
>  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
> +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> +				      pgoff_t end);
> +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> +				    pgoff_t end);
>  
>  static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>  {
> @@ -85,6 +89,306 @@ static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t inde
>  	return kvm_gmem_get_folio(inode, index);
>  }
>  
> +/**
> + * kvm_gmem_shareability_store() - Sets shareability to @value for range.
> + *
> + * @mt: the shareability maple tree.
> + * @index: the range begins at this index in the inode.
> + * @nr_pages: number of PAGE_SIZE pages in this range.
> + * @value: the shareability value to set for this range.
> + *
> + * Unlike mtree_store_range(), this function also merges adjacent ranges that
> + * have the same values as an optimization.

Is this an optimization or something which will be required to convert
from shared back to private and back to a huge page mapping?

If this is purely an optimization it might be best to leave it out for now
to get functionality first.

I have more to review but wanted to ask this.

Ira

[snip]
Re: [RFC PATCH v2 04/51] KVM: guest_memfd: Introduce KVM_GMEM_CONVERT_SHARED/PRIVATE ioctls
Posted by Ackerley Tng 7 months ago
Ira Weiny <ira.weiny@intel.com> writes:

> Ackerley Tng wrote:
>
> [snip]
>
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> 
>
> [snip]
>
>> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
>> index 590932499eba..f802116290ce 100644
>> --- a/virt/kvm/guest_memfd.c
>> +++ b/virt/kvm/guest_memfd.c
>> @@ -30,6 +30,10 @@ enum shareability {
>>  };
>>  
>>  static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index);
>> +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
>> +				      pgoff_t end);
>> +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
>> +				    pgoff_t end);
>>  
>>  static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
>>  {
>> @@ -85,6 +89,306 @@ static struct folio *kvm_gmem_get_shared_folio(struct inode *inode, pgoff_t inde
>>  	return kvm_gmem_get_folio(inode, index);
>>  }
>>  
>> +/**
>> + * kvm_gmem_shareability_store() - Sets shareability to @value for range.
>> + *
>> + * @mt: the shareability maple tree.
>> + * @index: the range begins at this index in the inode.
>> + * @nr_pages: number of PAGE_SIZE pages in this range.
>> + * @value: the shareability value to set for this range.
>> + *
>> + * Unlike mtree_store_range(), this function also merges adjacent ranges that
>> + * have the same values as an optimization.
>
> Is this an optimization or something which will be required to convert
> from shared back to private and back to a huge page mapping?
>

This is an optimization.

> If this is purely an optimization it might be best to leave it out for now
> to get functionality first.
>

I see this (small) optimization as part of using maple trees.

Fuad's version [1] uses xarrays and has 1 xarray entry per page
offset. I wanted to illustrate that by using maple trees, we can share
just 1 entry for a whole range, and part of that sharing involves
merging adjacent shareability entries that have the same value.
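
To make the merging idea concrete, here is a simplified sketch (not the
patch's actual implementation): locking, allocation failure handling and
the encoding of shareability values as tree entries (e.g. via
xa_mk_value()) are all ignored.

#include <linux/maple_tree.h>

/* Before storing @value over [index, index + nr_pages - 1], widen the
 * stored range to swallow neighbouring ranges that already hold the
 * same value, so the tree keeps one entry per contiguous run. */
static int shareability_store_merged(struct maple_tree *mt,
				     unsigned long index,
				     unsigned long nr_pages, void *value)
{
	unsigned long first = index;
	unsigned long last = index + nr_pages - 1;
	MA_STATE(mas, mt, first, last);

	/* Extend left if the preceding range stores the same value... */
	if (mas_prev(&mas, 0) == value)
		first = mas.index;

	/* ...and right if the following range does too. */
	mas_set(&mas, last);
	if (mas_next(&mas, ULONG_MAX) == value)
		last = mas.last;

	/* A single store now covers the merged span. */
	return mtree_store_range(mt, first, last, value, GFP_KERNEL);
}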

IIUC, these other users of maple trees also do some kind of
expansion/range merging:

+ VMAs in vma_expand() [2]
+ regcache in regcache_maple_write() [3]

> I have more to review but wanted to ask this.
>
> Ira
>
> [snip]

[1] https://lore.kernel.org/all/20250328153133.3504118-4-tabba@google.com/
[2] https://elixir.bootlin.com/linux/v6.14.6/source/mm/vma.c#L1059
[3] https://elixir.bootlin.com/linux/v6.14.6/source/drivers/base/regmap/regcache-maple.c#L38