[RFC PATCH v3 3/6] kvm: gmem: implement direct map manipulation routines

Patrick Roy posted 6 patches 3 weeks, 4 days ago
[RFC PATCH v3 3/6] kvm: gmem: implement direct map manipulation routines
Posted by Patrick Roy 3 weeks, 4 days ago
Implement (yet unused) routines for manipulating guest_memfd direct map
state. This is largely for illustration purposes.

kvm_gmem_set_direct_map allows manipulating arbitrary pgoff_t
ranges, even if the covered memory has not yet been faulted in (in which
case the requested direct map state is recorded in the xarray and will
be applied by kvm_gmem_folio_configure_direct_map after the folio is
faulted in and prepared/populated). This can be used to realize
private/shared conversions on not-yet-faulted-in memory, as discussed in
the guest_memfd upstream call [1].

kvm_gmem_folio_set_direct_map allows manipulating the direct map entries
for a gmem folio that the caller already holds a reference for (whereas
kvm_gmem_set_direct_map needs to look up all folios intersecting the
given pgoff range in the filemap first).

The xa lock serializes calls to kvm_gmem_folio_set_direct_map and
kvm_gmem_set_direct_map, while the read side
(kvm_gmem_folio_configure_direct_map) is protected by RCU. This is
sufficient to ensure consistency between the xarray and the folio's
actual direct map state, as kvm_gmem_folio_configure_direct_map is
called only for freshly allocated folios, and before the folio lock is
dropped for the first time, meaning kvm_gmem_folio_configure_direct_map
always does its set_direct_map calls before either of
kvm_gmem_[folio_]set_direct_map gets a chance. Even if a concurrent call
to kvm_gmem_[folio_]set_direct_map happens, this ensures a sort of
"eventual consistency" between xarray and actual direct map
configuration by the time kvm_gmem_[folio_]set_direct_map exits.

[1]: https://lore.kernel.org/kvm/4b49248b-1cf1-44dc-9b50-ee551e1671ac@redhat.com/

Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
---
 virt/kvm/guest_memfd.c | 125 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 50ffc2ad73eda..54387828dcc6a 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -96,6 +96,131 @@ static int kvm_gmem_folio_configure_direct_map(struct folio *folio)
 	return r;
 }
 
+/*
+ * Updates the range [@start, @end] (inclusive) in @gmem_priv's direct map state
+ * xarray to be @state: entries are erased if @state is the default state, and
+ * created otherwise. Returns 0, or the negative errno of a failed store, in which
+ * case the indices before the failing one remain updated.
+ *
+ * Assumes the xa_lock is held. NOTE(review): per the XArray documentation,
+ * __xa_store() may drop and retake that lock to allocate with GFP_KERNEL.
+ */
+static int __kvm_gmem_update_xarray(struct kvm_gmem_inode_private *gmem_priv, pgoff_t start,
+				    pgoff_t end, bool state)
+{
+	struct xarray *xa = &gmem_priv->direct_map_state;
+	int r = 0;
+
+	/*
+	 * Cannot use xa_store_range, as multi-indexes cannot easily
+	 * be partially updated.
+	 */
+	for (pgoff_t index = start; index <= end; ++index) {
+		if (state == gmem_priv->default_direct_map_state)
+			__xa_erase(xa, index);
+		else
+			/* only presence matters; xa_err() decodes the entry the store returns */
+			r = xa_err(__xa_store(xa, index, gmem_priv, GFP_KERNEL));
+
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+/* Sets direct map validity of @folio's pages [@start, @end] (inclusive), then flushes the TLB. */
+static int __kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start, pgoff_t end,
+					   bool state)
+{
+	struct page *head = folio_file_page(folio, start);
+	unsigned long nr = end - start + 1, vaddr;
+	int ret = set_direct_map_valid_noflush(head, nr, state);
+
+	/* Flush regardless of @ret; the flushed range spans all @nr pages from @head. */
+	vaddr = (unsigned long)page_address(head);
+	flush_tlb_kernel_range(vaddr, vaddr + nr * PAGE_SIZE);
+	return ret;
+}
+
+/*
+ * Updates the direct map status of the pages of @folio at indices @start to @end
+ * (inclusive) and records @state in the xarray of the inode @folio is attached to.
+ * Returns -EINVAL if @start > @end or the range is not fully contained in @folio.
+ *
+ * Takes and drops the folio lock.
+ * NOTE(review): folio_lock() can sleep but is taken under the xa_lock spinlock.
+ */
+static __always_unused int kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start,
+								 pgoff_t end, bool state)
+{
+	struct inode *inode = folio_inode(folio);
+	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
+	int r;
+
+	/* start > end would make the page count (end - start + 1) wrap around */
+	if (start > end || !folio_contains(folio, start) || !folio_contains(folio, end))
+		return -EINVAL;
+
+	xa_lock(&gmem_priv->direct_map_state);
+	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
+	if (r)
+		goto unlock_xa;
+
+	folio_lock(folio);
+	r = __kvm_gmem_folio_set_direct_map(folio, start, end, state);
+	folio_unlock(folio);
+unlock_xa:
+	xa_unlock(&gmem_priv->direct_map_state);
+	return r;
+}
+
+/*
+ * Updates the direct map status for the given range from @start to @end (inclusive)
+ * of @inode. The xarray in the @inode's private data is updated first, then folios
+ * already present in the range have their direct map entries reconfigured.
+ * Returns 0 on success, or the first error encountered (earlier updates remain).
+ *
+ * NOTE(review): folio_lock() can sleep but is taken under the xa_lock spinlock.
+ */
+static __always_unused int kvm_gmem_set_direct_map(struct inode *inode, pgoff_t start,
+							   pgoff_t end, bool state)
+{
+	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
+	struct folio_batch fbatch;
+	pgoff_t index = start;
+	unsigned int count, i;
+	int r;
+
+	xa_lock(&gmem_priv->direct_map_state);
+
+	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
+	if (r)
+		goto unlock_xa;
+
+	folio_batch_init(&fbatch);
+	/* filemap_get_folios() returns the batch size; stop on error or an empty batch */
+	while (!r && filemap_get_folios(inode->i_mapping, &index, end, &fbatch)) {
+		count = folio_batch_count(&fbatch);
+		for (i = 0; i < count && !r; i++) {
+			struct folio *folio = fbatch.folios[i];
+			/* clamp to the part of the folio inside [start, end], both inclusive */
+			pgoff_t first = max(folio_index(folio), start);
+			pgoff_t last =
+				min(folio_index(folio) + folio_nr_pages(folio) - 1, end);
+
+			folio_lock(folio);
+			r = __kvm_gmem_folio_set_direct_map(folio, first, last, state);
+			folio_unlock(folio);
+		}
+		folio_batch_release(&fbatch);
+	}
+
+unlock_xa:
+	xa_unlock(&gmem_priv->direct_map_state);
+	return r;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
-- 
2.47.0
Re: [RFC PATCH v3 3/6] kvm: gmem: implement direct map manipulation routines
Posted by Mike Day 3 weeks, 3 days ago

On 10/30/24 08:49, Patrick Roy wrote:
> Implement (yet unused) routines for manipulating guest_memfd direct map
> state. This is largely for illustration purposes.
> 
> kvm_gmem_set_direct_map allows manipulating arbitrary pgoff_t
> ranges, even if the covered memory has not yet been faulted in (in which
> case the requested direct map state is recorded in the xarray and will
> be applied by kvm_gmem_folio_configure_direct_map after the folio is
> faulted in and prepared/populated). This can be used to realize
> private/shared conversions on not-yet-faulted in memory, as discussed in
> the guest_memfd upstream call [1].
> 
> kvm_gmem_folio_set_direct_map allows manipulating the direct map entries
> for a gmem folio that the caller already holds a reference for (whereas
> kvm_gmem_set_direct_map needs to look up all folios intersecting the
> given pgoff range in the filemap first).
> 
> The xa lock serializes calls to kvm_gmem_folio_set_direct_map and
> kvm_gmem_set_direct_map, while the read side
> (kvm_gmem_folio_configure_direct_map) is protected by RCU. This is
> sufficient to ensure consistency between the xarray and the folio's
> actual direct map state, as kvm_gmem_folio_configure_direct_map is
> called only for freshly allocated folios, and before the folio lock is
> dropped for the first time, meaning kvm_gmem_folio_configure_direct_map
> always does it's set_direct_map calls before either of
> kvm_gmem_[folio_]set_direct_map get a chance. Even if a concurrent call
> to kvm_gmem_[folio_]set_direct_map happens, this ensures a sort of
> "eventual consistency" between xarray and actual direct map
> configuration by the time kvm_gmem_[folio_]set_direct_map exits.
> 
> [1]: https://lore.kernel.org/kvm/4b49248b-1cf1-44dc-9b50-ee551e1671ac@redhat.com/
> 
> Signed-off-by: Patrick Roy <roypat@amazon.co.uk>
> ---
>   virt/kvm/guest_memfd.c | 125 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 125 insertions(+)
> 
> diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> index 50ffc2ad73eda..54387828dcc6a 100644
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -96,6 +96,131 @@ static int kvm_gmem_folio_configure_direct_map(struct folio *folio)
>   	return r;
>   }
>   
> +/*
> + * Updates the range [@start, @end] in @gmem_priv's direct map state xarray to be @state,
> + * e.g. erasing entries in this range if @state is the default state, and creating
> + * entries otherwise.
> + *
> + * Assumes the xa_lock is held.
> + */
> +static int __kvm_gmem_update_xarray(struct kvm_gmem_inode_private *gmem_priv, pgoff_t start,
> +				    pgoff_t end, bool state)
> +{
> +	struct xarray *xa = &gmem_priv->direct_map_state;
> +	int r = 0;
> +
> +	/*
> +	 * Cannot use xa_store_range, as multi-indexes cannot easily
> +	 * be partially updated.
> +	 */
> +	for (pgoff_t index = start; index < end; ++index) {
> +		if (state == gmem_priv->default_direct_map_state)
> +			__xa_erase(xa, index);
> +		else
> +			/* don't care _what_ we store in the xarray, only care about presence */
> +			__xa_store(xa, index, gmem_priv, GFP_KERNEL);
> +
> +		r = xa_err(xa);
> +		if (r)
> +			goto out;
> +	}
> +
> +out:
> +	return r;
> +}
> +
> +static int __kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start, pgoff_t end,
> +					   bool state)
> +{
> +	unsigned long npages = end - start + 1;
> +	struct page *first_page = folio_file_page(folio, start);
> +
> +	int r = set_direct_map_valid_noflush(first_page, npages, state);
> +
> +	flush_tlb_kernel_range((unsigned long)page_address(first_page),
> +			       (unsigned long)page_address(first_page) +
> +				       npages * PAGE_SIZE);
> +	return r;
> +}
> +
> +/*
> + * Updates the direct map status for the given range from @start to @end (inclusive), returning
> + * -EINVAL if this range is not completely contained within @folio. Also updates the
> + * xarray stored in the private data of the inode @folio is attached to.
> + *
> + * Takes and drops the folio lock.
> + */
> +static __always_unused int kvm_gmem_folio_set_direct_map(struct folio *folio, pgoff_t start,
> +								 pgoff_t end, bool state)
> +{
> +	struct inode *inode = folio_inode(folio);
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	int r = -EINVAL;
> +
> +	if (!folio_contains(folio, start) || !folio_contains(folio, end))
> +		goto out;
> +
> +	xa_lock(&gmem_priv->direct_map_state);
> +	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
> +	if (r)
> +		goto unlock_xa;
> +
> +	folio_lock(folio);
> +	r = __kvm_gmem_folio_set_direct_map(folio, start, end, state);
> +	folio_unlock(folio);
> +
> +unlock_xa:
> +	xa_unlock(&gmem_priv->direct_map_state);
> +out:
> +	return r;
> +}
> +
> +/*
> + * Updates the direct map status for the given range from @start to @end (inclusive)
> + * of @inode. Folios in this range have their direct map entries reconfigured,
> + * and the xarray in the @inode's private data is updated.
> + */
> +static __always_unused int kvm_gmem_set_direct_map(struct inode *inode, pgoff_t start,
> +							   pgoff_t end, bool state)
> +{
> +	struct kvm_gmem_inode_private *gmem_priv = inode->i_private;
> +	struct folio_batch fbatch;
> +	pgoff_t index = start;
> +	unsigned int count, i;
> +	int r = 0;
> +
> +	xa_lock(&gmem_priv->direct_map_state);
> +
> +	r = __kvm_gmem_update_xarray(gmem_priv, start, end, state);
> +	if (r)
> +		goto out;
> +
	if (r) {
		xa_unlock(&gmem_priv->direct_map_state);
		goto out;
	}

thanks,

Mike

> +	folio_batch_init(&fbatch);
> +	while (!filemap_get_folios(inode->i_mapping, &index, end, &fbatch) && !r) {
> +		count = folio_batch_count(&fbatch);
> +		for (i = 0; i < count; i++) {
> +			struct folio *folio = fbatch.folios[i];
> +			pgoff_t folio_start = max(folio_index(folio), start);
> +			pgoff_t folio_end =
> +				min(folio_index(folio) + folio_nr_pages(folio),
> +				    end);
> +
> +			folio_lock(folio);
> +			r = __kvm_gmem_folio_set_direct_map(folio, folio_start,
> +							    folio_end, state);
> +			folio_unlock(folio);
> +
> +			if (r)
> +				break;
> +		}
> +		folio_batch_release(&fbatch);
> +	}
> +
> +	xa_unlock(&gmem_priv->direct_map_state);
> +out:
> +	return r;
> +}
> +
>   /**
>    * folio_file_pfn - like folio_file_page, but return a pfn.
>    * @folio: The folio which contains this index.