[PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Steven Price 4 months ago

The VMM needs to populate the realm with some data (e.g. a kernel and
initrd) before starting it. This content is measured by the RMM and
used as part of the attestation later on.
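
As an illustration, a VMM could drive this with something along the
lines of the hypothetical sketch below. The struct layout, cap and flag
names are assumptions based on the uAPI definitions added earlier in
this series and on the handler added at the end of this patch, which
reads the struct pointer from cap->args[1]:

  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int realm_populate(int vm_fd, __u64 base_ipa, __u64 size)
  {
          struct arm_rme_populate_realm args = {
                  .base  = base_ipa,  /* must be PAGE_SIZE aligned */
                  .size  = size,      /* must be PAGE_SIZE aligned */
                  .flags = RMI_MEASURE_CONTENT,  /* measure the contents */
          };
          struct kvm_enable_cap cap = {
                  .cap  = KVM_CAP_ARM_RME,  /* assumed top-level cap */
                  .args = { KVM_CAP_ARM_RME_POPULATE_REALM,  /* action */
                            (__u64)(unsigned long)&args },
          };

          /*
           * Fails with -EPERM once the realm is no longer in the NEW
           * state, and with -EINVAL for misaligned or wrapping base/size
           * or unknown flags.
           */
          return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
  }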

Co-developed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Steven Price <steven.price@arm.com>
Reviewed-by: Gavin Shan <gshan@redhat.com>
---
Changes since v7:
 * Improve the error codes.
 * Other minor changes from review.
Changes since v6:
 * Handle host potentially having a larger page size than the RMM
   granule.
 * Drop historic "par" (protected address range) from
   populate_par_region() - it doesn't exist within the current
   architecture.
 * Add a cond_resched() call in kvm_populate_realm().
Changes since v5:
 * Refactor to use PFNs rather than tracking struct page in
   realm_create_protected_data_page().
 * Pull changes from a later patch (in the v5 series) for accessing
   pages from a guest memfd.
 * Do the populate in chunks to avoid holding locks for too long and
   triggering RCU stall warnings.
---
 arch/arm64/kvm/rme.c | 227 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 227 insertions(+)

diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
index 678b14ba2466..d7bb11583506 100644
--- a/arch/arm64/kvm/rme.c
+++ b/arch/arm64/kvm/rme.c
@@ -660,6 +660,221 @@ void kvm_realm_unmap_range(struct kvm *kvm, unsigned long start,
 		realm_unmap_private_range(kvm, start, end, may_block);
 }
 
+static int realm_create_protected_data_granule(struct realm *realm,
+					       unsigned long ipa,
+					       phys_addr_t dst_phys,
+					       phys_addr_t src_phys,
+					       unsigned long flags)
+{
+	phys_addr_t rd = virt_to_phys(realm->rd);
+	int ret;
+
+	if (rmi_granule_delegate(dst_phys))
+		return -ENXIO;
+
+	ret = rmi_data_create(rd, dst_phys, ipa, src_phys, flags);
+	if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
+		/* Create missing RTTs and retry */
+		int level = RMI_RETURN_INDEX(ret);
+
+		WARN_ON(level == RMM_RTT_MAX_LEVEL);
+
+		ret = realm_create_rtt_levels(realm, ipa, level,
+					      RMM_RTT_MAX_LEVEL, NULL);
+		if (ret)
+			return -EIO;
+
+		ret = rmi_data_create(rd, dst_phys, ipa, src_phys, flags);
+	}
+	if (ret)
+		return -EIO;
+
+	return 0;
+}
+
+static int realm_create_protected_data_page(struct realm *realm,
+					    unsigned long ipa,
+					    kvm_pfn_t dst_pfn,
+					    kvm_pfn_t src_pfn,
+					    unsigned long flags)
+{
+	unsigned long rd = virt_to_phys(realm->rd);
+	phys_addr_t dst_phys, src_phys;
+	bool undelegate_failed = false;
+	int ret, offset;
+
+	dst_phys = __pfn_to_phys(dst_pfn);
+	src_phys = __pfn_to_phys(src_pfn);
+
+	for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
+		ret = realm_create_protected_data_granule(realm,
+							  ipa,
+							  dst_phys,
+							  src_phys,
+							  flags);
+		if (ret)
+			goto err;
+
+		ipa += RMM_PAGE_SIZE;
+		dst_phys += RMM_PAGE_SIZE;
+		src_phys += RMM_PAGE_SIZE;
+	}
+
+	return 0;
+
+err:
+	if (ret == -EIO) {
+		/* current offset needs undelegating */
+		if (WARN_ON(rmi_granule_undelegate(dst_phys)))
+			undelegate_failed = true;
+	}
+	while (offset > 0) {
+		ipa -= RMM_PAGE_SIZE;
+		offset -= RMM_PAGE_SIZE;
+		dst_phys -= RMM_PAGE_SIZE;
+
+		rmi_data_destroy(rd, ipa, NULL, NULL);
+
+		if (WARN_ON(rmi_granule_undelegate(dst_phys)))
+			undelegate_failed = true;
+	}
+
+	if (undelegate_failed) {
+		/*
+		 * A granule could not be undelegated,
+		 * so the page has to be leaked
+		 */
+		get_page(pfn_to_page(dst_pfn));
+	}
+
+	return -ENXIO;
+}
+
+static int populate_region(struct kvm *kvm,
+			   phys_addr_t ipa_base,
+			   phys_addr_t ipa_end,
+			   unsigned long data_flags)
+{
+	struct realm *realm = &kvm->arch.realm;
+	struct kvm_memory_slot *memslot;
+	gfn_t base_gfn, end_gfn;
+	int idx;
+	phys_addr_t ipa = ipa_base;
+	int ret = 0;
+
+	base_gfn = gpa_to_gfn(ipa_base);
+	end_gfn = gpa_to_gfn(ipa_end);
+
+	idx = srcu_read_lock(&kvm->srcu);
+	memslot = gfn_to_memslot(kvm, base_gfn);
+	if (!memslot) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* We require the region to be contained within a single memslot */
+	if (memslot->base_gfn + memslot->npages < end_gfn) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!kvm_slot_can_be_private(memslot)) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	while (ipa < ipa_end) {
+		struct vm_area_struct *vma;
+		unsigned long hva;
+		struct page *page;
+		bool writeable;
+		kvm_pfn_t pfn;
+		kvm_pfn_t priv_pfn;
+		struct page *gmem_page;
+
+		hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
+		vma = vma_lookup(current->mm, hva);
+		if (!vma) {
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
+					&writeable, &page);
+
+		if (is_error_pfn(pfn)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		ret = kvm_gmem_get_pfn(kvm, memslot,
+				       ipa >> PAGE_SHIFT,
+				       &priv_pfn, &gmem_page, NULL);
+		if (ret)
+			break;
+
+		ret = realm_create_protected_data_page(realm, ipa,
+						       priv_pfn,
+						       pfn,
+						       data_flags);
+
+		kvm_release_page_clean(page);
+
+		if (ret)
+			break;
+
+		ipa += PAGE_SIZE;
+	}
+
+out:
+	srcu_read_unlock(&kvm->srcu, idx);
+	return ret;
+}
+
+static int kvm_populate_realm(struct kvm *kvm,
+			      struct arm_rme_populate_realm *args)
+{
+	phys_addr_t ipa_base, ipa_end;
+	unsigned long data_flags = 0;
+
+	if (kvm_realm_state(kvm) != REALM_STATE_NEW)
+		return -EPERM;
+
+	if (!IS_ALIGNED(args->base, PAGE_SIZE) ||
+	    !IS_ALIGNED(args->size, PAGE_SIZE) ||
+	    (args->flags & ~RMI_MEASURE_CONTENT))
+		return -EINVAL;
+
+	ipa_base = args->base;
+	ipa_end = ipa_base + args->size;
+
+	if (ipa_end < ipa_base)
+		return -EINVAL;
+
+	if (args->flags & RMI_MEASURE_CONTENT)
+		data_flags |= RMI_MEASURE_CONTENT;
+
+	/*
+	 * Perform the population in parts to ensure locks are not held for too
+	 * long
+	 */
+	while (ipa_base < ipa_end) {
+		phys_addr_t end = min(ipa_end, ipa_base + SZ_2M);
+
+		int ret = populate_region(kvm, ipa_base, end,
+					  args->flags);
+
+		if (ret)
+			return ret;
+
+		ipa_base = end;
+
+		cond_resched();
+	}
+
+	return 0;
+}
+
 enum ripas_action {
 	RIPAS_INIT,
 	RIPAS_SET,
@@ -903,6 +1118,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		r = kvm_init_ipa_range_realm(kvm, &args);
 		break;
 	}
+	case KVM_CAP_ARM_RME_POPULATE_REALM: {
+		struct arm_rme_populate_realm args;
+		void __user *argp = u64_to_user_ptr(cap->args[1]);
+
+		if (copy_from_user(&args, argp, sizeof(args))) {
+			r = -EFAULT;
+			break;
+		}
+
+		r = kvm_populate_realm(kvm, &args);
+		break;
+	}
 	default:
 		r = -EINVAL;
 		break;
-- 
2.43.0
Re: [PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Vishal Annapurve 2 months, 1 week ago
On Wed, Jun 11, 2025 at 3:59 AM Steven Price <steven.price@arm.com> wrote:
>
> +static int realm_create_protected_data_page(struct realm *realm,
> +                                           unsigned long ipa,
> +                                           kvm_pfn_t dst_pfn,
> +                                           kvm_pfn_t src_pfn,
> +                                           unsigned long flags)
> +{
> +       unsigned long rd = virt_to_phys(realm->rd);
> +       phys_addr_t dst_phys, src_phys;
> +       bool undelegate_failed = false;
> +       int ret, offset;
> +
> +       dst_phys = __pfn_to_phys(dst_pfn);
> +       src_phys = __pfn_to_phys(src_pfn);
> +
> +       for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
> +               ret = realm_create_protected_data_granule(realm,
> +                                                         ipa,
> +                                                         dst_phys,
> +                                                         src_phys,
> +                                                         flags);
> +               if (ret)
> +                       goto err;
> +
> +               ipa += RMM_PAGE_SIZE;
> +               dst_phys += RMM_PAGE_SIZE;
> +               src_phys += RMM_PAGE_SIZE;
> +       }
> +
> +       return 0;
> +
> +err:
> +       if (ret == -EIO) {
> +               /* current offset needs undelegating */
> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> +                       undelegate_failed = true;
> +       }
> +       while (offset > 0) {
> +               ipa -= RMM_PAGE_SIZE;
> +               offset -= RMM_PAGE_SIZE;
> +               dst_phys -= RMM_PAGE_SIZE;
> +
> +               rmi_data_destroy(rd, ipa, NULL, NULL);
> +
> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> +                       undelegate_failed = true;
> +       }
> +
> +       if (undelegate_failed) {
> +               /*
> +                * A granule could not be undelegated,
> +                * so the page has to be leaked
> +                */
> +               get_page(pfn_to_page(dst_pfn));

I would like to point out that support for in-place conversion with
guest_memfd using hugetlb pages [1] is under discussion.

As part of the in-place conversion, the policy we are rooting for is
to avoid any "refcounts" from KVM on folios supplied by guest_memfd,
as in-place conversion works by splitting and merging folios during
memory conversion, as per the discussion at LPC [2].

The best way to avoid further use of this page with huge page support
around would be either:
1) Explicitly inform guest_memfd of a particular pfn being in use by
KVM without relying on page refcounts, or
2) Set the page as hwpoisoned. (Needs further discussion)

This page refcounting strategy will have to be revisited depending on
which series lands first. That being said, it would be great if ARM
could review/verify if the series [1] works for backing CCA VMs with
huge pages.

[1] https://lore.kernel.org/kvm/cover.1747264138.git.ackerleytng@google.com/
[2] https://lpc.events/event/18/contributions/1764/

> +       }
> +
> +       return -ENXIO;
> +}
> +
> +static int populate_region(struct kvm *kvm,
> +                          phys_addr_t ipa_base,
> +                          phys_addr_t ipa_end,
> +                          unsigned long data_flags)
> +{
> +       struct realm *realm = &kvm->arch.realm;
> +       struct kvm_memory_slot *memslot;
> +       gfn_t base_gfn, end_gfn;
> +       int idx;
> +       phys_addr_t ipa = ipa_base;
> +       int ret = 0;
> +
> +       base_gfn = gpa_to_gfn(ipa_base);
> +       end_gfn = gpa_to_gfn(ipa_end);
> +
> +       idx = srcu_read_lock(&kvm->srcu);
> +       memslot = gfn_to_memslot(kvm, base_gfn);
> +       if (!memslot) {
> +               ret = -EFAULT;
> +               goto out;
> +       }
> +
> +       /* We require the region to be contained within a single memslot */
> +       if (memslot->base_gfn + memslot->npages < end_gfn) {
> +               ret = -EINVAL;
> +               goto out;
> +       }
> +
> +       if (!kvm_slot_can_be_private(memslot)) {
> +               ret = -EPERM;
> +               goto out;
> +       }
> +
> +       while (ipa < ipa_end) {
> +               struct vm_area_struct *vma;
> +               unsigned long hva;
> +               struct page *page;
> +               bool writeable;
> +               kvm_pfn_t pfn;
> +               kvm_pfn_t priv_pfn;
> +               struct page *gmem_page;
> +
> +               hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> +               vma = vma_lookup(current->mm, hva);
> +               if (!vma) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
> +                                       &writeable, &page);

Is this assuming double backing of guest memory ranges? Is this logic
trying to simulate a shared fault?

Does memory population work with CCA if priv_pfn and pfn are the same?
I am curious how the memory population will work with in-place
conversion support available for guest_memfd files.

> +
> +               if (is_error_pfn(pfn)) {
> +                       ret = -EFAULT;
> +                       break;
> +               }
> +
> +               ret = kvm_gmem_get_pfn(kvm, memslot,
> +                                      ipa >> PAGE_SHIFT,
> +                                      &priv_pfn, &gmem_page, NULL);
> +               if (ret)
> +                       break;
> +
> +               ret = realm_create_protected_data_page(realm, ipa,
> +                                                      priv_pfn,
> +                                                      pfn,
> +                                                      data_flags);
> +
> +               kvm_release_page_clean(page);
> +
> +               if (ret)
> +                       break;
> +
> +               ipa += PAGE_SIZE;
> +       }
> +
> +out:
> +       srcu_read_unlock(&kvm->srcu, idx);
> +       return ret;
> +}
> +
Re: [PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Steven Price 1 month, 4 weeks ago
On 01/08/2025 02:56, Vishal Annapurve wrote:
> On Wed, Jun 11, 2025 at 3:59 AM Steven Price <steven.price@arm.com> wrote:
>>
>> +static int realm_create_protected_data_page(struct realm *realm,
>> +                                           unsigned long ipa,
>> +                                           kvm_pfn_t dst_pfn,
>> +                                           kvm_pfn_t src_pfn,
>> +                                           unsigned long flags)
>> +{
>> +       unsigned long rd = virt_to_phys(realm->rd);
>> +       phys_addr_t dst_phys, src_phys;
>> +       bool undelegate_failed = false;
>> +       int ret, offset;
>> +
>> +       dst_phys = __pfn_to_phys(dst_pfn);
>> +       src_phys = __pfn_to_phys(src_pfn);
>> +
>> +       for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
>> +               ret = realm_create_protected_data_granule(realm,
>> +                                                         ipa,
>> +                                                         dst_phys,
>> +                                                         src_phys,
>> +                                                         flags);
>> +               if (ret)
>> +                       goto err;
>> +
>> +               ipa += RMM_PAGE_SIZE;
>> +               dst_phys += RMM_PAGE_SIZE;
>> +               src_phys += RMM_PAGE_SIZE;
>> +       }
>> +
>> +       return 0;
>> +
>> +err:
>> +       if (ret == -EIO) {
>> +               /* current offset needs undelegating */
>> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
>> +                       undelegate_failed = true;
>> +       }
>> +       while (offset > 0) {
>> +               ipa -= RMM_PAGE_SIZE;
>> +               offset -= RMM_PAGE_SIZE;
>> +               dst_phys -= RMM_PAGE_SIZE;
>> +
>> +               rmi_data_destroy(rd, ipa, NULL, NULL);
>> +
>> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
>> +                       undelegate_failed = true;
>> +       }
>> +
>> +       if (undelegate_failed) {
>> +               /*
>> +                * A granule could not be undelegated,
>> +                * so the page has to be leaked
>> +                */
>> +               get_page(pfn_to_page(dst_pfn));
> 
> I would like to point out that the support for in-place conversion
> with guest_memfd using hugetlb pages [1] is under discussion.
> 
> As part of the in-place conversion, the policy we are rooting for is
> to avoid any "refcounts" from KVM on folios supplied by guest_memfd as
> in-place conversion works by splitting and merging folios during
> memory conversion as per discussion at LPC [2].

CCA doesn't really support "in-place" conversions (see more detail
below). But here the issue is that something has gone wrong and the RMM
is refusing to give us a page back.

> 
> The best way to avoid further use of this page with huge page support
> around would be either:
> 1) Explicitly Inform guest_memfd of a particular pfn being in use by
> KVM without relying on page refcounts or

This might work, but note that the page is unavailable even after user
space has freed the guest_memfd. So at some point the page needs to be
marked so that it cannot be reallocated by the kernel. Holding a
refcount isn't ideal but I haven't come up with a better idea.

Note that this is a "should never happen" situation - the code will have
WARN()ed already - so this is just a best effort to allow the system to
limp on.

> 2) Set the page as hwpoisoned. (Needs further discussion)

This certainly sounds like a closer fit - but I'm not very familiar with
hwpoison so I don't know how easy it would be to integrate with this.

> This page refcounting strategy will have to be revisited depending on
> which series lands first. That being said, it would be great if ARM
> could review/verify if the series [1] works for backing CCA VMs with
> huge pages.
> 
> [1] https://lore.kernel.org/kvm/cover.1747264138.git.ackerleytng@google.com/
> [2] https://lpc.events/event/18/contributions/1764/
> 
>> +       }
>> +
>> +       return -ENXIO;
>> +}
>> +
>> +static int populate_region(struct kvm *kvm,
>> +                          phys_addr_t ipa_base,
>> +                          phys_addr_t ipa_end,
>> +                          unsigned long data_flags)
>> +{
>> +       struct realm *realm = &kvm->arch.realm;
>> +       struct kvm_memory_slot *memslot;
>> +       gfn_t base_gfn, end_gfn;
>> +       int idx;
>> +       phys_addr_t ipa = ipa_base;
>> +       int ret = 0;
>> +
>> +       base_gfn = gpa_to_gfn(ipa_base);
>> +       end_gfn = gpa_to_gfn(ipa_end);
>> +
>> +       idx = srcu_read_lock(&kvm->srcu);
>> +       memslot = gfn_to_memslot(kvm, base_gfn);
>> +       if (!memslot) {
>> +               ret = -EFAULT;
>> +               goto out;
>> +       }
>> +
>> +       /* We require the region to be contained within a single memslot */
>> +       if (memslot->base_gfn + memslot->npages < end_gfn) {
>> +               ret = -EINVAL;
>> +               goto out;
>> +       }
>> +
>> +       if (!kvm_slot_can_be_private(memslot)) {
>> +               ret = -EPERM;
>> +               goto out;
>> +       }
>> +
>> +       while (ipa < ipa_end) {
>> +               struct vm_area_struct *vma;
>> +               unsigned long hva;
>> +               struct page *page;
>> +               bool writeable;
>> +               kvm_pfn_t pfn;
>> +               kvm_pfn_t priv_pfn;
>> +               struct page *gmem_page;
>> +
>> +               hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
>> +               vma = vma_lookup(current->mm, hva);
>> +               if (!vma) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
>> +                                       &writeable, &page);
> 
> Is this assuming double backing of guest memory ranges? Is this logic
> trying to simulate a shared fault?

Yes and yes...

> Does memory population work with CCA if priv_pfn and pfn are the same?
> I am curious how the memory population will work with in-place
> conversion support available for guest_memfd files.

The RMM interface doesn't support an in-place conversion. The
RMI_DATA_CREATE operation takes the PA of the already delegated
granule[1] along with the PA of a non-delegated granule with the data.

So mimicking an in-place conversion requires copying the data out of the
page, delegating the (original) page and then using RMI_DATA_CREATE,
which copies the data back in. Fundamentally, because memory encryption
may be involved, there is always going to be a requirement for this
double-memcpy() approach. Note that this is only relevant during the
initial setup phase - CCA doesn't (at least currently) permit populated
pages to be provided to the guest while it is running.
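
Roughly, per granule, such an emulated in-place populate would look
something like the sketch below (illustrative only - the helper name is
made up, and the RTT-miss handling and undelegate-on-error cleanup that
realm_create_protected_data_granule() does are omitted):

  static int emulate_inplace_populate(struct realm *realm, unsigned long ipa,
                                      phys_addr_t page_phys, unsigned long flags)
  {
          void *bounce = (void *)__get_free_page(GFP_KERNEL);
          int ret;

          if (!bounce)
                  return -ENOMEM;

          /* memcpy #1: preserve the current contents of the target page */
          memcpy(bounce, phys_to_virt(page_phys), RMM_PAGE_SIZE);

          /* The original page becomes the (delegated) protected destination */
          if (rmi_granule_delegate(page_phys)) {
                  free_page((unsigned long)bounce);
                  return -ENXIO;
          }

          /* memcpy #2: the RMM copies the data back in from the bounce page */
          ret = rmi_data_create(virt_to_phys(realm->rd), page_phys, ipa,
                                virt_to_phys(bounce), flags);

          free_page((unsigned long)bounce);
          return ret ? -EIO : 0;
  }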

The approach this series takes pre-dates the guest_memfd discussions and
so assumes that the shared memory is not (directly) provided by the
guest_memfd but comes from the user space pointer provided in the memslot.
It would be possible (with the patches proposed) for the VMM to mmap()
the guest_memfd when the memory is being shared so as to reuse the
physical pages.

I do also plan to look at supporting the use of the guest_memfd for the
shared memory directly. But I've been waiting for the discussions to
conclude before attempting to implement that.

[1] A 'granule' is the RMM's idea of a page size (RMM_PAGE_SIZE), which
is currently (RMM v1.0) always 4k, so it may differ from PAGE_SIZE when
Linux is running with a larger page size.

Thanks,
Steve

>> +
>> +               if (is_error_pfn(pfn)) {
>> +                       ret = -EFAULT;
>> +                       break;
>> +               }
>> +
>> +               ret = kvm_gmem_get_pfn(kvm, memslot,
>> +                                      ipa >> PAGE_SHIFT,
>> +                                      &priv_pfn, &gmem_page, NULL);
>> +               if (ret)
>> +                       break;
>> +
>> +               ret = realm_create_protected_data_page(realm, ipa,
>> +                                                      priv_pfn,
>> +                                                      pfn,
>> +                                                      data_flags);
>> +
>> +               kvm_release_page_clean(page);
>> +
>> +               if (ret)
>> +                       break;
>> +
>> +               ipa += PAGE_SIZE;
>> +       }
>> +
>> +out:
>> +       srcu_read_unlock(&kvm->srcu, idx);
>> +       return ret;
>> +}
>> +

Re: [PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Vishal Annapurve 1 month, 3 weeks ago
On Wed, Aug 13, 2025 at 2:30 AM Steven Price <steven.price@arm.com> wrote:
>
> On 01/08/2025 02:56, Vishal Annapurve wrote:
> > On Wed, Jun 11, 2025 at 3:59 AM Steven Price <steven.price@arm.com> wrote:
> >>
> >> +static int realm_create_protected_data_page(struct realm *realm,
> >> +                                           unsigned long ipa,
> >> +                                           kvm_pfn_t dst_pfn,
> >> +                                           kvm_pfn_t src_pfn,
> >> +                                           unsigned long flags)
> >> +{
> >> +       unsigned long rd = virt_to_phys(realm->rd);
> >> +       phys_addr_t dst_phys, src_phys;
> >> +       bool undelegate_failed = false;
> >> +       int ret, offset;
> >> +
> >> +       dst_phys = __pfn_to_phys(dst_pfn);
> >> +       src_phys = __pfn_to_phys(src_pfn);
> >> +
> >> +       for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
> >> +               ret = realm_create_protected_data_granule(realm,
> >> +                                                         ipa,
> >> +                                                         dst_phys,
> >> +                                                         src_phys,
> >> +                                                         flags);
> >> +               if (ret)
> >> +                       goto err;
> >> +
> >> +               ipa += RMM_PAGE_SIZE;
> >> +               dst_phys += RMM_PAGE_SIZE;
> >> +               src_phys += RMM_PAGE_SIZE;
> >> +       }
> >> +
> >> +       return 0;
> >> +
> >> +err:
> >> +       if (ret == -EIO) {
> >> +               /* current offset needs undelegating */
> >> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> >> +                       undelegate_failed = true;
> >> +       }
> >> +       while (offset > 0) {
> >> +               ipa -= RMM_PAGE_SIZE;
> >> +               offset -= RMM_PAGE_SIZE;
> >> +               dst_phys -= RMM_PAGE_SIZE;
> >> +
> >> +               rmi_data_destroy(rd, ipa, NULL, NULL);
> >> +
> >> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> >> +                       undelegate_failed = true;
> >> +       }
> >> +
> >> +       if (undelegate_failed) {
> >> +               /*
> >> +                * A granule could not be undelegated,
> >> +                * so the page has to be leaked
> >> +                */
> >> +               get_page(pfn_to_page(dst_pfn));
> >
> > I would like to point out that the support for in-place conversion
> > with guest_memfd using hugetlb pages [1] is under discussion.
> >
> > As part of the in-place conversion, the policy we are rooting for is
> > to avoid any "refcounts" from KVM on folios supplied by guest_memfd as
> > in-place conversion works by splitting and merging folios during
> > memory conversion as per discussion at LPC [2].
>
> CCA doesn't really support "in-place" conversions (see more detail
> below). But here the issue is that something has gone wrong and the RMM
> is refusing to give us a page back.

I think I overloaded the term "in-place" conversion in this context. I
was talking about supporting "in-place" conversion without data
preservation, i.e. the host will use the same GPA->HPA range mapping
even after conversions, ensuring a single backing for guest memory.
This is achieved by guest_memfd keeping track of private/shared ranges
based on userspace IOCTLs that change the tracking metadata.

>
> >
> > The best way to avoid further use of this page with huge page support
> > around would be either:
> > 1) Explicitly Inform guest_memfd of a particular pfn being in use by
> > KVM without relying on page refcounts or
>
> This might work, but note that the page is unavailable even after user
> space has freed the guest_memfd. So at some point the page needs to be
> marked so that it cannot be reallocated by the kernel. Holding a
> refcount isn't ideal but I haven't come up with a better idea.
>
> Note that this is a "should never happen" situation - the code will have
> WARN()ed already - so this is just a best effort to allow the system to
> limp on.
>
> > 2) Set the page as hwpoisoned. (Needs further discussion)
>
> This certainly sounds like a closer fit - but I'm not very familiar with
> hwpoison so I don't know how easy it would be to integrate with this.
>

We had similar discussions around Intel-specific SEPT management, and
the conclusion there was to just not hold refcounts and give a warning
on such failures [1].

[1] https://lore.kernel.org/kvm/20250807094241.4523-1-yan.y.zhao@intel.com/

> > This page refcounting strategy will have to be revisited depending on
> > which series lands first. That being said, it would be great if ARM
> > could review/verify if the series [1] works for backing CCA VMs with
> > huge pages.
> >
> > [1] https://lore.kernel.org/kvm/cover.1747264138.git.ackerleytng@google.com/
> > [2] https://lpc.events/event/18/contributions/1764/
> >
> >> +       }
> >> +
> >> +       return -ENXIO;
> >> +}
> >> +
> >> +static int populate_region(struct kvm *kvm,
> >> +                          phys_addr_t ipa_base,
> >> +                          phys_addr_t ipa_end,
> >> +                          unsigned long data_flags)
> >> +{
> >> +       struct realm *realm = &kvm->arch.realm;
> >> +       struct kvm_memory_slot *memslot;
> >> +       gfn_t base_gfn, end_gfn;
> >> +       int idx;
> >> +       phys_addr_t ipa = ipa_base;
> >> +       int ret = 0;
> >> +
> >> +       base_gfn = gpa_to_gfn(ipa_base);
> >> +       end_gfn = gpa_to_gfn(ipa_end);
> >> +
> >> +       idx = srcu_read_lock(&kvm->srcu);
> >> +       memslot = gfn_to_memslot(kvm, base_gfn);
> >> +       if (!memslot) {
> >> +               ret = -EFAULT;
> >> +               goto out;
> >> +       }
> >> +
> >> +       /* We require the region to be contained within a single memslot */
> >> +       if (memslot->base_gfn + memslot->npages < end_gfn) {
> >> +               ret = -EINVAL;
> >> +               goto out;
> >> +       }
> >> +
> >> +       if (!kvm_slot_can_be_private(memslot)) {
> >> +               ret = -EPERM;
> >> +               goto out;
> >> +       }
> >> +
> >> +       while (ipa < ipa_end) {
> >> +               struct vm_area_struct *vma;
> >> +               unsigned long hva;
> >> +               struct page *page;
> >> +               bool writeable;
> >> +               kvm_pfn_t pfn;
> >> +               kvm_pfn_t priv_pfn;
> >> +               struct page *gmem_page;
> >> +
> >> +               hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> >> +               vma = vma_lookup(current->mm, hva);
> >> +               if (!vma) {
> >> +                       ret = -EFAULT;
> >> +                       break;
> >> +               }
> >> +
> >> +               pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
> >> +                                       &writeable, &page);
> >
> > Is this assuming double backing of guest memory ranges? Is this logic
> > trying to simulate a shared fault?
>
> Yes and yes...
>
> > Does memory population work with CCA if priv_pfn and pfn are the same?
> > I am curious how the memory population will work with in-place
> > conversion support available for guest_memfd files.
>
> The RMM interface doesn't support an in-place conversion. The
> RMI_DATA_CREATE operation takes the PA of the already delegated
> granule[1] along with the PA of a non-delegated granule with the data.

Ok, I think this will need a source buffer from userspace that is
outside guest_memfd once guest_memfd supports a single backing for
guest memory. You might want to simulate a private access fault for
destination GPAs backed by guest_memfd ranges for this initial data
population, similar to how memory population works today with TDX
VMs.

Note that with single backing around, at least for x86, KVM
shared/private stage2 faults will always be served using guest_memfd
as the source of truth (i.e. not via userspace pagetables for shared
faults).

>
> So to mimic an in-place conversion requires copying the data from the
> page, delegating the (original) page and then using RMI_DATA_CREATE
> which copies the data back. Fundamentally because there may be memory
> encryption involved there is going to be a requirement for this double
> memcpy() approach. Note that this is only relevant during the initial
> setup phase - CCA doesn't (at least currently) permit populated pages to
> be provided to the guest when it is running.
>
> The approach this series takes pre-dates the guest_memfd discussions and
> so is assuming that the shared memory is not (directly) provided by the
> guest_memfd but is using the user space pointer provided in the memslot.
> It would be possible (with the patches proposed) for the VMM to mmap()
> the guest_memfd when the memory is being shared so as to reuse the
> physical pages.
>
> I do also plan to look at supporting the use of the guest_memfd for the
> shared memory directly. But I've been waiting for the discussions to
> conclude before attempting to implement that.
>
> [1] A 'granule' is the RMM's idea of a page size (RMM_PAGE_SIZE), which
> is currently (RMM v1.0) always 4k. So may be different when Linux is
> running with a larger page size.
>
> Thanks,
> Steve
>
Re: [PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Steven Price 1 month, 3 weeks ago
On 14/08/2025 17:26, Vishal Annapurve wrote:
> On Wed, Aug 13, 2025 at 2:30 AM Steven Price <steven.price@arm.com> wrote:
>>
>> On 01/08/2025 02:56, Vishal Annapurve wrote:
>>> On Wed, Jun 11, 2025 at 3:59 AM Steven Price <steven.price@arm.com> wrote:
>>>>
>>>> +static int realm_create_protected_data_page(struct realm *realm,
>>>> +                                           unsigned long ipa,
>>>> +                                           kvm_pfn_t dst_pfn,
>>>> +                                           kvm_pfn_t src_pfn,
>>>> +                                           unsigned long flags)
>>>> +{
>>>> +       unsigned long rd = virt_to_phys(realm->rd);
>>>> +       phys_addr_t dst_phys, src_phys;
>>>> +       bool undelegate_failed = false;
>>>> +       int ret, offset;
>>>> +
>>>> +       dst_phys = __pfn_to_phys(dst_pfn);
>>>> +       src_phys = __pfn_to_phys(src_pfn);
>>>> +
>>>> +       for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
>>>> +               ret = realm_create_protected_data_granule(realm,
>>>> +                                                         ipa,
>>>> +                                                         dst_phys,
>>>> +                                                         src_phys,
>>>> +                                                         flags);
>>>> +               if (ret)
>>>> +                       goto err;
>>>> +
>>>> +               ipa += RMM_PAGE_SIZE;
>>>> +               dst_phys += RMM_PAGE_SIZE;
>>>> +               src_phys += RMM_PAGE_SIZE;
>>>> +       }
>>>> +
>>>> +       return 0;
>>>> +
>>>> +err:
>>>> +       if (ret == -EIO) {
>>>> +               /* current offset needs undelegating */
>>>> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
>>>> +                       undelegate_failed = true;
>>>> +       }
>>>> +       while (offset > 0) {
>>>> +               ipa -= RMM_PAGE_SIZE;
>>>> +               offset -= RMM_PAGE_SIZE;
>>>> +               dst_phys -= RMM_PAGE_SIZE;
>>>> +
>>>> +               rmi_data_destroy(rd, ipa, NULL, NULL);
>>>> +
>>>> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
>>>> +                       undelegate_failed = true;
>>>> +       }
>>>> +
>>>> +       if (undelegate_failed) {
>>>> +               /*
>>>> +                * A granule could not be undelegated,
>>>> +                * so the page has to be leaked
>>>> +                */
>>>> +               get_page(pfn_to_page(dst_pfn));
>>>
>>> I would like to point out that the support for in-place conversion
>>> with guest_memfd using hugetlb pages [1] is under discussion.
>>>
>>> As part of the in-place conversion, the policy we are rooting for is
>>> to avoid any "refcounts" from KVM on folios supplied by guest_memfd as
>>> in-place conversion works by splitting and merging folios during
>>> memory conversion as per discussion at LPC [2].
>>
>> CCA doesn't really support "in-place" conversions (see more detail
>> below). But here the issue is that something has gone wrong and the RMM
>> is refusing to give us a page back.
> 
> I think I overloaded the term "in-place" conversion in this context. I
> was talking about supporting "in-place" conversion without data
> preservation. i.e. Host will use the same GPA->HPA range mapping even
> after conversions, ensuring single backing for guest memory. This is
> achieved by guest_memfd keeping track of private/shared ranges based
> on userspace IOCTLs to change the tracking metadata.

Yes, so for a destructive conversion this is fine. We can remove the
page from the protected region and then place the same physical page in
the shared region (or vice versa).

Population is a special case because it's effectively non-destructive,
and in that case we need both the reference data and the final
(protected) physical page available at the same time.

>>
>>>
>>> The best way to avoid further use of this page with huge page support
>>> around would be either:
>>> 1) Explicitly Inform guest_memfd of a particular pfn being in use by
>>> KVM without relying on page refcounts or
>>
>> This might work, but note that the page is unavailable even after user
>> space has freed the guest_memfd. So at some point the page needs to be
>> marked so that it cannot be reallocated by the kernel. Holding a
>> refcount isn't ideal but I haven't come up with a better idea.
>>
>> Note that this is a "should never happen" situation - the code will have
>> WARN()ed already - so this is just a best effort to allow the system to
>> limp on.
>>
>>> 2) Set the page as hwpoisoned. (Needs further discussion)
>>
>> This certainly sounds like a closer fit - but I'm not very familiar with
>> hwpoison so I don't know how easy it would be to integrate with this.
>>
> 
> We had similar discussions with Intel specific SEPT management and the
> conclusion there was to just not hold refcounts and give a warning on
> such failures [1].
> 
> [1] https://lore.kernel.org/kvm/20250807094241.4523-1-yan.y.zhao@intel.com/

So these paths (should) all warn already. I guess the question is
whether we want the platform to limp on in these situations or not.
Certainly converting the WARNs to BUG_ON would be very easy, but the
intention here was to give the user some chance to save their work
before killing the system.

Just WARNing might be ok, but if the kernel allocates the page for one
of its data structures then it's effectively a BUG_ON - there's no
reasonable recovery. Reallocation into user space we can sort of handle,
but only by killing the process.

>>> This page refcounting strategy will have to be revisited depending on
>>> which series lands first. That being said, it would be great if ARM
>>> could review/verify if the series [1] works for backing CCA VMs with
>>> huge pages.
>>>
>>> [1] https://lore.kernel.org/kvm/cover.1747264138.git.ackerleytng@google.com/
>>> [2] https://lpc.events/event/18/contributions/1764/
>>>
>>>> +       }
>>>> +
>>>> +       return -ENXIO;
>>>> +}
>>>> +
>>>> +static int populate_region(struct kvm *kvm,
>>>> +                          phys_addr_t ipa_base,
>>>> +                          phys_addr_t ipa_end,
>>>> +                          unsigned long data_flags)
>>>> +{
>>>> +       struct realm *realm = &kvm->arch.realm;
>>>> +       struct kvm_memory_slot *memslot;
>>>> +       gfn_t base_gfn, end_gfn;
>>>> +       int idx;
>>>> +       phys_addr_t ipa = ipa_base;
>>>> +       int ret = 0;
>>>> +
>>>> +       base_gfn = gpa_to_gfn(ipa_base);
>>>> +       end_gfn = gpa_to_gfn(ipa_end);
>>>> +
>>>> +       idx = srcu_read_lock(&kvm->srcu);
>>>> +       memslot = gfn_to_memslot(kvm, base_gfn);
>>>> +       if (!memslot) {
>>>> +               ret = -EFAULT;
>>>> +               goto out;
>>>> +       }
>>>> +
>>>> +       /* We require the region to be contained within a single memslot */
>>>> +       if (memslot->base_gfn + memslot->npages < end_gfn) {
>>>> +               ret = -EINVAL;
>>>> +               goto out;
>>>> +       }
>>>> +
>>>> +       if (!kvm_slot_can_be_private(memslot)) {
>>>> +               ret = -EPERM;
>>>> +               goto out;
>>>> +       }
>>>> +
>>>> +       while (ipa < ipa_end) {
>>>> +               struct vm_area_struct *vma;
>>>> +               unsigned long hva;
>>>> +               struct page *page;
>>>> +               bool writeable;
>>>> +               kvm_pfn_t pfn;
>>>> +               kvm_pfn_t priv_pfn;
>>>> +               struct page *gmem_page;
>>>> +
>>>> +               hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
>>>> +               vma = vma_lookup(current->mm, hva);
>>>> +               if (!vma) {
>>>> +                       ret = -EFAULT;
>>>> +                       break;
>>>> +               }
>>>> +
>>>> +               pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
>>>> +                                       &writeable, &page);
>>>
>>> Is this assuming double backing of guest memory ranges? Is this logic
>>> trying to simulate a shared fault?
>>
>> Yes and yes...
>>
>>> Does memory population work with CCA if priv_pfn and pfn are the same?
>>> I am curious how the memory population will work with in-place
>>> conversion support available for guest_memfd files.
>>
>> The RMM interface doesn't support an in-place conversion. The
>> RMI_DATA_CREATE operation takes the PA of the already delegated
>> granule[1] along with the PA of a non-delegated granule with the data.
> 
> Ok, I think this will need a source buffer from userspace that is
> outside guest_memfd once guest_memfd will support a single backing for
> guest memory. You might want to simulate private access fault for
> destination GPAs backed by guest_memfd ranges for this initial data
> population -> similar to how memory population works today with TDX
> VMs.

Yes, that might well be the best option. At the moment I'm not sure what
the best approach from the perspective of the VMM is. The current setup
is nice because the VMM can populate the VM just like a normal non-coco
setup, so we don't need to special-case anything. Requiring the VMM to
load the initial contents into some other memory for the purpose of the
populate call is a little ugly.

The other option is to just return to the double-memcpy() approach of
emulating an in-place, content-preserving populate: the kernel copies
the data out of guest_memfd into a temporary page before doing the
conversion, and the RMM copies it back from the temporary page. I worry
about the performance of this, although it doesn't prevent the first
option being added in the future if necessary. The big benefit is that
it goes back to looking like a non-coco VM (but using guest_memfd).

> Note that with single backing around, at least for x86, KVM
> shared/private stage2 faults will always be served using guest_memfd
> as the source of truth (i.e. not via userspace pagetables for shared
> faults).

Indeed, it's not yet clear to me whether we can only support this
"single backing" world for CCA, or if it's going to be an option
alongside using guest_memfd only for protected memory. Obviously we need
the guest_memfd changes to land first too...

At the moment I'm planning to post a v10 of this series soon, keeping
the same API. But we will definitely want to support the new guest_memfd
approach when it's ready.

Thanks,
Steve

>>
>> So to mimic an in-place conversion requires copying the data from the
>> page, delegating the (original) page and then using RMI_DATA_CREATE
>> which copies the data back. Fundamentally because there may be memory
>> encryption involved there is going to be a requirement for this double
>> memcpy() approach. Note that this is only relevant during the initial
>> setup phase - CCA doesn't (at least currently) permit populated pages to
>> be provided to the guest when it is running.
>>
>> The approach this series takes pre-dates the guest_memfd discussions and
>> so is assuming that the shared memory is not (directly) provided by the
>> guest_memfd but is using the user space pointer provided in the memslot.
>> It would be possible (with the patches proposed) for the VMM to mmap()
>> the guest_memfd when the memory is being shared so as to reuse the
>> physical pages.
>>
>> I do also plan to look at supporting the use of the guest_memfd for the
>> shared memory directly. But I've been waiting for the discussions to
>> conclude before attempting to implement that.
>>
>> [1] A 'granule' is the RMM's idea of a page size (RMM_PAGE_SIZE), which
>> is currently (RMM v1.0) always 4k. So may be different when Linux is
>> running with a larger page size.
>>
>> Thanks,
>> Steve
>>

Re: [PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Vishal Annapurve 1 month, 3 weeks ago
On Fri, Aug 15, 2025 at 8:48 AM Steven Price <steven.price@arm.com> wrote:
>
> On 14/08/2025 17:26, Vishal Annapurve wrote:
> > On Wed, Aug 13, 2025 at 2:30 AM Steven Price <steven.price@arm.com> wrote:
> >>
> >> On 01/08/2025 02:56, Vishal Annapurve wrote:
> >>> On Wed, Jun 11, 2025 at 3:59 AM Steven Price <steven.price@arm.com> wrote:
> >>>>
> >>>> +static int realm_create_protected_data_page(struct realm *realm,
> >>>> +                                           unsigned long ipa,
> >>>> +                                           kvm_pfn_t dst_pfn,
> >>>> +                                           kvm_pfn_t src_pfn,
> >>>> +                                           unsigned long flags)
> >>>> +{
> >>>> +       unsigned long rd = virt_to_phys(realm->rd);
> >>>> +       phys_addr_t dst_phys, src_phys;
> >>>> +       bool undelegate_failed = false;
> >>>> +       int ret, offset;
> >>>> +
> >>>> +       dst_phys = __pfn_to_phys(dst_pfn);
> >>>> +       src_phys = __pfn_to_phys(src_pfn);
> >>>> +
> >>>> +       for (offset = 0; offset < PAGE_SIZE; offset += RMM_PAGE_SIZE) {
> >>>> +               ret = realm_create_protected_data_granule(realm,
> >>>> +                                                         ipa,
> >>>> +                                                         dst_phys,
> >>>> +                                                         src_phys,
> >>>> +                                                         flags);
> >>>> +               if (ret)
> >>>> +                       goto err;
> >>>> +
> >>>> +               ipa += RMM_PAGE_SIZE;
> >>>> +               dst_phys += RMM_PAGE_SIZE;
> >>>> +               src_phys += RMM_PAGE_SIZE;
> >>>> +       }
> >>>> +
> >>>> +       return 0;
> >>>> +
> >>>> +err:
> >>>> +       if (ret == -EIO) {
> >>>> +               /* current offset needs undelegating */
> >>>> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> >>>> +                       undelegate_failed = true;
> >>>> +       }
> >>>> +       while (offset > 0) {
> >>>> +               ipa -= RMM_PAGE_SIZE;
> >>>> +               offset -= RMM_PAGE_SIZE;
> >>>> +               dst_phys -= RMM_PAGE_SIZE;
> >>>> +
> >>>> +               rmi_data_destroy(rd, ipa, NULL, NULL);
> >>>> +
> >>>> +               if (WARN_ON(rmi_granule_undelegate(dst_phys)))
> >>>> +                       undelegate_failed = true;
> >>>> +       }
> >>>> +
> >>>> +       if (undelegate_failed) {
> >>>> +               /*
> >>>> +                * A granule could not be undelegated,
> >>>> +                * so the page has to be leaked
> >>>> +                */
> >>>> +               get_page(pfn_to_page(dst_pfn));
> >>>
> >>> I would like to point out that the support for in-place conversion
> >>> with guest_memfd using hugetlb pages [1] is under discussion.
> >>>
> >>> As part of the in-place conversion, the policy we are rooting for is
> >>> to avoid any "refcounts" from KVM on folios supplied by guest_memfd as
> >>> in-place conversion works by splitting and merging folios during
> >>> memory conversion as per discussion at LPC [2].
> >>
> >> CCA doesn't really support "in-place" conversions (see more detail
> >> below). But here the issue is that something has gone wrong and the RMM
> >> is refusing to give us a page back.
> >
> > I think I overloaded the term "in-place" conversion in this context. I
> > was talking about supporting "in-place" conversion without data
> > preservation. i.e. Host will use the same GPA->HPA range mapping even
> > after conversions, ensuring single backing for guest memory. This is
> > achieved by guest_memfd keeping track of private/shared ranges based
> > on userspace IOCTLs to change the tracking metadata.
>
> Yes, so for a destructive conversion this is fine. We can remove the
> page from the protected region and then place the same physical page in
> the shared region (or vice versa).
>
> Population is a special case because it's effectively non-destructive,
> and in that case we need both the reference data and the final
> (protected) physical page both available at same time.
>
> >>
> >>>
> >>> The best way to avoid further use of this page with huge page support
> >>> around would be either:
> >>> 1) Explicitly Inform guest_memfd of a particular pfn being in use by
> >>> KVM without relying on page refcounts or
> >>
> >> This might work, but note that the page is unavailable even after user
> >> space has freed the guest_memfd. So at some point the page needs to be
> >> marked so that it cannot be reallocated by the kernel. Holding a
> >> refcount isn't ideal but I haven't come up with a better idea.
> >>
> >> Note that this is a "should never happen" situation - the code will have
> >> WARN()ed already - so this is just a best effort to allow the system to
> >> limp on.
> >>
> >>> 2) Set the page as hwpoisoned. (Needs further discussion)
> >>
> >> This certainly sounds like a closer fit - but I'm not very familiar with
> >> hwpoison so I don't know how easy it would be to integrate with this.
> >>
> >
> > We had similar discussions with Intel specific SEPT management and the
> > conclusion there was to just not hold refcounts and give a warning on
> > such failures [1].
> >
> > [1] https://lore.kernel.org/kvm/20250807094241.4523-1-yan.y.zhao@intel.com/
>
> So these paths (should) all warn already. I guess the question is
> whether we want the platform to limp on in these situations or not.
> Certainly converting the WARNs to BUG_ON would be very easy, but the
> intention here was to give the user some chance to save their work
> before killing the system.
>
> Just WARNing might be ok, but if the kernel allocates the page for one
> of it's data structures then it's effectively a BUG_ON - there's no
> reasonable recovery. Reallocation into user space we can sort of handle,
> but only by killing the process.

Makes sense; this scenario is different from TDX, as the host will run
into GPT faults when accessing memory owned by the realm world, with no
good way to recover. Let's try to find the right level of complexity to
take on when handling errors for such rare scenarios, without relying
on refcounts, as we start looking into huge page support.

>
> >>> This page refcounting strategy will have to be revisited depending on
> >>> which series lands first. That being said, it would be great if ARM
> >>> could review/verify if the series [1] works for backing CCA VMs with
> >>> huge pages.
> >>>
> >>> [1] https://lore.kernel.org/kvm/cover.1747264138.git.ackerleytng@google.com/
> >>> [2] https://lpc.events/event/18/contributions/1764/
Re: [PATCH v9 19/43] arm64: RME: Allow populating initial contents
Posted by Vishal Annapurve 1 month, 3 weeks ago
On Fri, Aug 15, 2025 at 8:48 AM Steven Price <steven.price@arm.com> wrote:
>
> >>>> +static int populate_region(struct kvm *kvm,
> >>>> +                          phys_addr_t ipa_base,
> >>>> +                          phys_addr_t ipa_end,
> >>>> +                          unsigned long data_flags)
> >>>> +{
> >>>> +       struct realm *realm = &kvm->arch.realm;
> >>>> +       struct kvm_memory_slot *memslot;
> >>>> +       gfn_t base_gfn, end_gfn;
> >>>> +       int idx;
> >>>> +       phys_addr_t ipa = ipa_base;
> >>>> +       int ret = 0;
> >>>> +
> >>>> +       base_gfn = gpa_to_gfn(ipa_base);
> >>>> +       end_gfn = gpa_to_gfn(ipa_end);
> >>>> +
> >>>> +       idx = srcu_read_lock(&kvm->srcu);
> >>>> +       memslot = gfn_to_memslot(kvm, base_gfn);
> >>>> +       if (!memslot) {
> >>>> +               ret = -EFAULT;
> >>>> +               goto out;
> >>>> +       }
> >>>> +
> >>>> +       /* We require the region to be contained within a single memslot */
> >>>> +       if (memslot->base_gfn + memslot->npages < end_gfn) {
> >>>> +               ret = -EINVAL;
> >>>> +               goto out;
> >>>> +       }
> >>>> +
> >>>> +       if (!kvm_slot_can_be_private(memslot)) {
> >>>> +               ret = -EPERM;
> >>>> +               goto out;
> >>>> +       }
> >>>> +
> >>>> +       while (ipa < ipa_end) {
> >>>> +               struct vm_area_struct *vma;
> >>>> +               unsigned long hva;
> >>>> +               struct page *page;
> >>>> +               bool writeable;
> >>>> +               kvm_pfn_t pfn;
> >>>> +               kvm_pfn_t priv_pfn;
> >>>> +               struct page *gmem_page;
> >>>> +
> >>>> +               hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
> >>>> +               vma = vma_lookup(current->mm, hva);
> >>>> +               if (!vma) {
> >>>> +                       ret = -EFAULT;
> >>>> +                       break;
> >>>> +               }
> >>>> +
> >>>> +               pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
> >>>> +                                       &writeable, &page);
> >>>
> >>> Is this assuming double backing of guest memory ranges? Is this logic
> >>> trying to simulate a shared fault?
> >>
> >> Yes and yes...
> >>
> >>> Does memory population work with CCA if priv_pfn and pfn are the same?
> >>> I am curious how the memory population will work with in-place
> >>> conversion support available for guest_memfd files.
> >>
> >> The RMM interface doesn't support an in-place conversion. The
> >> RMI_DATA_CREATE operation takes the PA of the already delegated
> >> granule[1] along with the PA of a non-delegated granule with the data.
> >
> > Ok, I think this will need a source buffer from userspace that is
> > outside guest_memfd once guest_memfd will support a single backing for
> > guest memory. You might want to simulate private access fault for
> > destination GPAs backed by guest_memfd ranges for this initial data
> > population -> similar to how memory population works today with TDX
> > VMs.
>
> Yes, that might well be the best option. At the moment I'm not sure what
> the best approach from the perspective of the VMM is. The current setup
> is nice because the VMM can populate the VM just like a normal non-coco
> setup, so we don't need to special-case anything. Requiring the VMM to
> load the initial contents into some other memory for the purpose of the
> populate call is a little ugly.

IIUC the ARM CCA architecture requires two buffers for initial memory
population irrespective of whether we do it within the kernel or in
userspace.

IMO, doing it in the kernel is uglier than doing it in userspace.
Baking the requirement of passing two buffers into the userspace ABI
for populate seems cleaner to me.

>
> The other option is to just return to the double-memcpy() approach of
> emulating an in-place content-preserving populate by the kernel copying
> the data out of guest_memfd into a temporary page before doing the
> conversion and the RMM copying back from the temporary page. I worry
> about the performance of this, although it doesn't prevent the first
> option being added in the future if necessary. The big benefit is that
> it goes back to looking like a non-coco VM (but using guest_memfd).

The populate path is CoCo-specific; I don't see the benefit of trying to
match the behavior of non-CoCo VMs in userspace.

>
> > Note that with single backing around, at least for x86, KVM
> > shared/private stage2 faults will always be served using guest_memfd
> > as the source of truth (i.e. not via userspace pagetables for shared
> > faults).
>
> Indeed, it's not yet clear to me whether we can only support this
> "single backing" world for CCA, or if it's going to be an option
> alongside using guest_memfd only for protected memory. Obviously we need
> the guest_memfd changes to land first too...

Having dual backing support just for the initial populate seems
overkill to me. How much payload are we generally talking about, as a
percentage of total memory size?

IMO, dual backing for guest_memfd was a stopgap solution and should be
deprecated once we have single memory backing support.
CoCo VMs should default to using single backing going forward. Mmap
support [1] for guest_memfd is close to getting merged. In-place
conversion support is likely to follow soon.

[1] https://lore.kernel.org/all/20250729225455.670324-1-seanjc@google.com/

>
> At the moment I'm planning to post a v10 of this series soon, keeping
> the same API. But we will definitely want to support the new guest_memfd
> approach when it's ready.
>
> Thanks,
> Steve
>