[RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl

Aneesh Kumar K.V (Arm) posted 38 patches 2 months, 1 week ago
[RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Aneesh Kumar K.V (Arm) 2 months, 1 week ago
With passthrough devices, we need to make sure private memory is
allocated and assigned to the secure guest before we can issue the DMA.
For ARM RMM, we only need to map the memory; the secure SMMU management
is internal to RMM. For shared IPA, the vfio/iommufd DMA MAP/UNMAP
interface does the equivalent.

Signed-off-by: Aneesh Kumar K.V (Arm) <aneesh.kumar@kernel.org>
---
 arch/arm64/kvm/mmu.c                    | 45 +++++++++++++++++++++++++
 drivers/iommu/iommufd/iommufd_private.h |  1 +
 drivers/iommu/iommufd/main.c            |  4 +++
 drivers/iommu/iommufd/viommu.c          | 43 +++++++++++++++++++++++
 include/linux/kvm_host.h                |  1 +
 include/uapi/linux/iommufd.h            | 10 ++++++
 6 files changed, 104 insertions(+)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index c866891fd8f9..8788d24095d6 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1530,6 +1530,51 @@ static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa,
 	return realm_map_protected(realm, ipa, pfn, map_size, memcache);
 }
 
+int kvm_map_private_memory(struct kvm *kvm, phys_addr_t start_gpa,
+			   phys_addr_t end_gpa)
+{
+	struct kvm_mmu_memory_cache cache = { .gfp_zero = __GFP_ZERO };
+	struct kvm_s2_mmu *mmu = &kvm->arch.mmu;
+	struct kvm_memory_slot *memslot;
+	phys_addr_t addr;
+	struct page *page;
+	kvm_pfn_t pfn;
+	int ret = 0, idx;
+	gfn_t gfn;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	for (addr = start_gpa; addr < end_gpa; addr += PAGE_SIZE) {
+
+		ret = kvm_mmu_topup_memory_cache(&cache,
+						 kvm_mmu_cache_min_pages(mmu));
+		if (ret)
+			break;
+
+		gfn = addr >> PAGE_SHIFT;
+
+		memslot = gfn_to_memslot(kvm, gfn);
+		if (!kvm_slot_can_be_private(memslot)) {
+			ret = -EINVAL;
+			break;
+		}
+		/* should we check if kvm_mem_is_private()? */
+		ret = kvm_gmem_get_pfn(kvm, memslot, gfn, &pfn, &page, NULL);
+		if (ret)
+			break;
+
+		/* should we hold kvm_fault_lock()? */
+		ret = realm_map_ipa(kvm, addr, pfn, PAGE_SIZE, KVM_PGTABLE_PROT_W,
+				    &cache);
+		if (ret) {
+			put_page(page);
+			break;
+		}
+	}
+	kvm_mmu_free_memory_cache(&cache);
+	srcu_read_unlock(&kvm->srcu, idx);
+	return ret;
+}
+
 static int private_memslot_fault(struct kvm_vcpu *vcpu,
 				 phys_addr_t fault_ipa,
 				 struct kvm_memory_slot *memslot)
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 0c0d96135432..34f3ae0e0cd1 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -698,6 +698,7 @@ void iommufd_vdevice_abort(struct iommufd_object *obj);
 int iommufd_hw_queue_alloc_ioctl(struct iommufd_ucmd *ucmd);
 void iommufd_hw_queue_destroy(struct iommufd_object *obj);
 int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd);
+int iommufd_vdevice_tsm_map_ioctl(struct iommufd_ucmd *ucmd);
 int iommufd_vdevice_tsm_guest_request_ioctl(struct iommufd_ucmd *ucmd);
 
 static inline struct iommufd_vdevice *
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 65e60da9caef..388d11334994 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -411,6 +411,7 @@ union ucmd_buffer {
 	struct iommu_vfio_ioas vfio_ioas;
 	struct iommu_viommu_alloc viommu;
 	struct iommu_vdevice_tsm_op tsm_op;
+	struct iommu_vdevice_tsm_map tsm_map;
 	struct iommu_vdevice_tsm_guest_request gr;
 #ifdef CONFIG_IOMMUFD_TEST
 	struct iommu_test_cmd test;
@@ -475,6 +476,9 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
 		 struct iommu_viommu_alloc, out_viommu_id),
 	IOCTL_OP(IOMMU_VDEVICE_TSM_OP, iommufd_vdevice_tsm_op_ioctl,
 		 struct iommu_vdevice_tsm_op, vdevice_id),
+	IOCTL_OP(IOMMU_VDEVICE_TSM_MAP, iommufd_vdevice_tsm_map_ioctl,
+		 struct iommu_vdevice_tsm_map, vdevice_id),
+
 	IOCTL_OP(IOMMU_VDEVICE_TSM_GUEST_REQUEST, iommufd_vdevice_tsm_guest_request_ioctl,
 		 struct iommu_vdevice_tsm_guest_request, resp_uptr),
 #ifdef CONFIG_IOMMUFD_TEST
diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c
index 9f4d4d69b82b..1ffc996caa3e 100644
--- a/drivers/iommu/iommufd/viommu.c
+++ b/drivers/iommu/iommufd/viommu.c
@@ -516,6 +516,44 @@ int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd)
 	return rc;
 }
 
+int __weak kvm_map_private_memory(struct kvm *kvm, phys_addr_t start_gpa,
+				  phys_addr_t end_gpa)
+{
+	return -EINVAL;
+}
+
+int iommufd_vdevice_tsm_map_ioctl(struct iommufd_ucmd *ucmd)
+{
+	struct iommu_vdevice_tsm_map *cmd = ucmd->cmd;
+	struct iommufd_vdevice *vdev;
+	struct kvm *kvm;
+	int rc = -ENODEV;
+
+	if (cmd->flags)
+		return -EOPNOTSUPP;
+
+	vdev = container_of(iommufd_get_object(ucmd->ictx, cmd->vdevice_id,
+					       IOMMUFD_OBJ_VDEVICE),
+			    struct iommufd_vdevice, obj);
+	if (IS_ERR(vdev))
+		return PTR_ERR(vdev);
+
+	kvm = vdev->viommu->kvm_filp->private_data;
+	if (kvm) {
+		rc = kvm_map_private_memory(kvm, cmd->start_gpa, cmd->end_gpa);
+		if (rc)
+			goto out_put_vdev;
+
+	} else {
+		goto out_put_vdev;
+	}
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+
+out_put_vdev:
+	iommufd_put_object(ucmd->ictx, &vdev->obj);
+	return rc;
+}
+
 int iommufd_vdevice_tsm_guest_request_ioctl(struct iommufd_ucmd *ucmd)
 {
 	struct iommu_vdevice_tsm_guest_request *cmd = ucmd->cmd;
@@ -555,6 +593,11 @@ int iommufd_vdevice_tsm_op_ioctl(struct iommufd_ucmd *ucmd)
 	return -ENODEV;
 }
 
+int iommufd_vdevice_tsm_map_ioctl(struct iommufd_ucmd *ucmd)
+{
+	return -ENODEV;
+}
+
 int iommufd_vdevice_tsm_guest_request_ioctl(struct iommufd_ucmd *ucmd)
 {
 	return -ENODEV;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3bde4fb5c6aa..bfdfb4f32d28 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -2602,4 +2602,5 @@ static inline int kvm_enable_virtualization(void) { return 0; }
 static inline void kvm_disable_virtualization(void) { }
 #endif
 
+int kvm_map_private_memory(struct kvm *kvm, phys_addr_t start_gpa, phys_addr_t end_gpa);
 #endif
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 56542cfcfa38..75056d1f141d 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -59,6 +59,7 @@ enum {
 	IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94,
 	IOMMUFD_CMD_VDEVICE_TSM_OP = 0x95,
 	IOMMUFD_CMD_VDEVICE_TSM_GUEST_REQUEST = 0x96,
+	IOMMUFD_CMD_VDEVICE_TSM_MAP = 0x97,
 };
 
 /**
@@ -1146,6 +1147,15 @@ struct iommu_vdevice_tsm_op {
 #define IOMMU_VDEICE_TSM_BIND		0x1
 #define IOMMU_VDEICE_TSM_UNBIND		0x2
 
+struct iommu_vdevice_tsm_map {
+	__u32 size;
+	__u32 flags;
+	__u64 start_gpa;
+	__u64 end_gpa;
+	__u32 vdevice_id;
+};
+#define IOMMU_VDEVICE_TSM_MAP	_IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_TSM_MAP)
+
 /**
  * struct iommufd_vevent_header - Virtual Event Header for a vEVENTQ Status
  * @flags: Combination of enum iommu_veventq_flag
-- 
2.43.0
Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Jason Gunthorpe 2 months, 1 week ago
On Mon, Jul 28, 2025 at 07:21:47PM +0530, Aneesh Kumar K.V (Arm) wrote:
> With passthrough devices, we need to make sure private memory is
> allocated and assigned to the secure guest before we can issue the DMA.
> For ARM RMM, we only need to map and the secure SMMU management is
> internal to RMM. For shared IPA, vfio/iommufd DMA MAP/UNMAP interface
> does the equivalent

I'm not really sure what this is about? It is about getting KVM to pin
all the memory and commit it to the RMM so it can be used for DMA?

But it looks really strange to have an iommufd ioctl that just calls a
KVM function. Feeling this should be a KVM function, or a guestmemfd
behavior??

I was kind of thinking it would be nice to have a guestmemfd mode that
was "pinned", meaning the memory is allocated and remains almost
always mapped into the TSM's page tables automatically. VFIO using
guests would set things this way.

Jason
Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Alexey Kardashevskiy 2 months ago

On 28/7/25 19:47, Jason Gunthorpe wrote:
> On Mon, Jul 28, 2025 at 07:21:47PM +0530, Aneesh Kumar K.V (Arm) wrote:
>> With passthrough devices, we need to make sure private memory is
>> allocated and assigned to the secure guest before we can issue the DMA.
>> For ARM RMM, we only need to map and the secure SMMU management is
>> internal to RMM. For shared IPA, vfio/iommufd DMA MAP/UNMAP interface
>> does the equivalent
> 
> I'm not really sure what this is about? It is about getting KVM to pin
> all the memory and commit it to the RMM so it can be used for DMA?
> 
> But it looks really strange to have an iommufd ioctl that just calls a
> KVM function. Feeling this should be a KVM function, or a guestmfd
> behavior??


I ended up exporting the guestmemfd's kvm_gmem_get_folio() for gfn->pfn and its fd a bit differently in iommufd - "no extra referencing":
https://github.com/AMDESE/linux-kvm/commit/f1ebd358327f026f413f8d3d64d46decfd6ab7f6

It is a new iommufd->kvm dependency though.

> I was kind of thinking it would be nice to have a guestmemfd mode that
> was "pinned", meaning the memory is allocated and remains almost
> always mapped into the TSM's page tables automatically. VFIO using
> guests would set things this way.

Yeah while doing the above, I was wondering if I want to pass the fd type when DMA-mapping from an fd or "detect" it as I do in the above commit or have some iommufd_fdmap_ops in this fd saying "(no) pinning needed" (or make this a flag of IOMMU_IOAS_MAP_FILE).

The "detection" is (mapping_inaccessible(mapping) && mapping_unevictable(mapping)), works for now.

btw in the AMD case, here it does not matter as much if it is private or shared, I map everything and let RMP and the VM deal with the permissions. Thanks,


> 
> Jason

-- 
Alexey
Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Aneesh Kumar K.V 2 months ago
Alexey Kardashevskiy <aik@amd.com> writes:

> On 28/7/25 19:47, Jason Gunthorpe wrote:
>> On Mon, Jul 28, 2025 at 07:21:47PM +0530, Aneesh Kumar K.V (Arm) wrote:
>>> With passthrough devices, we need to make sure private memory is
>>> allocated and assigned to the secure guest before we can issue the DMA.
>>> For ARM RMM, we only need to map and the secure SMMU management is
>>> internal to RMM. For shared IPA, vfio/iommufd DMA MAP/UNMAP interface
>>> does the equivalent
>> 
>> I'm not really sure what this is about? It is about getting KVM to pin
>> all the memory and commit it to the RMM so it can be used for DMA?
>> 
>> But it looks really strange to have an iommufd ioctl that just calls a
>> KVM function. Feeling this should be a KVM function, or a guestmfd
>> behavior??
>
>
> I ended up exporting the guestmemfd's kvm_gmem_get_folio() for gfn->pfn and its fd a bit differently in iommufd - "no extra referencing":
> https://github.com/AMDESE/linux-kvm/commit/f1ebd358327f026f413f8d3d64d46decfd6ab7f6
>
> It is a new iommufd->kvm dependency though.
>

Was the motivation for that design choice the fact that in case of AMD
VFIO/IOMMUFD manages both private memory allocation and updates to the
IOMMU page tables?

On the ARM side, the requirement is to ensure that pages are present in
the stage-2 page table, which is managed by the firmware (RMM). Because
of this, we need an interface that VFIO/IOMMUFD can use to trigger
stage-2 mappings within KVM.

Alternatively, we could introduce a dedicated KVM ioctl for this
purpose, avoiding the need to rely on IOMMUFD.

For reference, TDX uses a similar ioctl—`KVM_TDX_INIT_MEM_REGION`—to
initialize guest memory. However, that interface isn’t well-suited for
dynamic updates to stage-2 mappings during shared-to-private or
private-to-shared transitions.


>
>> I was kind of thinking it would be nice to have a guestmemfd mode that
>> was "pinned", meaning the memory is allocated and remains almost
>> always mapped into the TSM's page tables automatically. VFIO using
>> guests would set things this way.
>
> Yeah while doing the above, I was wondering if I want to pass the fd type when DMA-mapping from an fd or "detect" it as I do in the above commit or have some iommufd_fdmap_ops in this fd saying "(no) pinning needed" (or make this a flag of IOMMU_IOAS_MAP_FILE).
>
> The "detection" is (mapping_inaccessible(mapping) && mapping_unevictable(mapping)), works for now.
>
> btw in the AMD case, here it does not matter as much if it is private or shared, I map everything and let RMP and the VM deal with the permissions. Thanks,
>
>

-aneesh
Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Alexey Kardashevskiy 2 months ago

On 4/8/25 13:58, Aneesh Kumar K.V wrote:
> Alexey Kardashevskiy <aik@amd.com> writes:
> 
>> On 28/7/25 19:47, Jason Gunthorpe wrote:
>>> On Mon, Jul 28, 2025 at 07:21:47PM +0530, Aneesh Kumar K.V (Arm) wrote:
>>>> With passthrough devices, we need to make sure private memory is
>>>> allocated and assigned to the secure guest before we can issue the DMA.
>>>> For ARM RMM, we only need to map and the secure SMMU management is
>>>> internal to RMM. For shared IPA, vfio/iommufd DMA MAP/UNMAP interface
>>>> does the equivalent
>>>
>>> I'm not really sure what this is about? It is about getting KVM to pin
>>> all the memory and commit it to the RMM so it can be used for DMA?
>>>
>>> But it looks really strange to have an iommufd ioctl that just calls a
>>> KVM function. Feeling this should be a KVM function, or a guestmfd
>>> behavior??
>>
>>
>> I ended up exporting the guestmemfd's kvm_gmem_get_folio() for gfn->pfn and its fd a bit differently in iommufd - "no extra referencing":
>> https://github.com/AMDESE/linux-kvm/commit/f1ebd358327f026f413f8d3d64d46decfd6ab7f6
>>
>> It is a new iommufd->kvm dependency though.
>>
> 
> Was the motivation for that design choice the fact that in case of AMD
> VFIO/IOMMUFD manages both private memory allocation and updates to the
> IOMMU page tables?

IOMMUFD maps pages for DMA in the IOMMU pagetable, let it do just that.


> On the ARM side, the requirement is to ensure that pages are present in
> the stage-2 page table, which is managed by the firmware (RMM). Because
> of this, we need an interface that VFIO/IOMMUFD can use to trigger
> stage-2 mappings within KVM.
> 
> Alternatively, we could introduce a dedicated KVM ioctl for this
> purpose, avoiding the need to rely on IOMMUFD.

Right, if there is a requirement like this, and QEMU can do just another ioctl() - then I just do that, helps to untangle all these kernel module references. It is the firmware which makes sure that page tables are in sync so no much point teaching KVM about it imho, DMA map requests cannot go past QEMU anyway. Thanks,


> 
> For reference, TDX uses a similar ioctl—`KVM_TDX_INIT_MEM_REGION`—to
> initialize guest memory. However, that interface isn’t well-suited for
> dynamic updates to stage-2 mappings during shared-to-private or
> private-to-shared transitions.
> 
> 
>>
>>> I was kind of thinking it would be nice to have a guestmemfd mode that
>>> was "pinned", meaning the memory is allocated and remains almost
>>> always mapped into the TSM's page tables automatically. VFIO using
>>> guests would set things this way.
>>
>> Yeah while doing the above, I was wondering if I want to pass the fd type when DMA-mapping from an fd or "detect" it as I do in the above commit or have some iommufd_fdmap_ops in this fd saying "(no) pinning needed" (or make this a flag of IOMMU_IOAS_MAP_FILE).
>>
>> The "detection" is (mapping_inaccessible(mapping) && mapping_unevictable(mapping)), works for now.
>>
>> btw in the AMD case, here it does not matter as much if it is private or shared, I map everything and let RMP and the VM deal with the permissions. Thanks,
>>
>>
> 
> -aneesh

-- 
Alexey

Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Jason Gunthorpe 2 months ago
On Mon, Aug 04, 2025 at 08:02:23AM +0530, Alexey Kardashevskiy wrote:
> I ended up exporting the guestmemfd's kvm_gmem_get_folio() for gfn->pfn and its fd a bit differently in iommufd - "no extra referencing":
> https://github.com/AMDESE/linux-kvm/commit/f1ebd358327f026f413f8d3d64d46decfd6ab7f6

This patch needs to explain how the lifecycle works and why the IOMMU
can't UAF the memory. I think it cannot really work as shown without
more things like an invalidation callback.

IOW you need to define for how long the return result of
guest_memfd_get_pfn() is valid for.

> > I was kind of thinking it would be nice to have a guestmemfd mode that
> > was "pinned", meaning the memory is allocated and remains almost
> > always mapped into the TSM's page tables automatically. VFIO using
> > guests would set things this way.
> 
> Yeah while doing the above, I was wondering if I want to pass the fd
> type when DMA-mapping from an fd or "detect" it as I do in the above
> commit or have some iommufd_fdmap_ops in this fd saying "(no)
> pinning needed" (or make this a flag of IOMMU_IOAS_MAP_FILE).

It should be autodetected.

Since this is unique behavior for guestmemfd it is fine to start
there..

> btw in the AMD case, here it does not matter as much if it is
> private or shared, I map everything and let RMP and the VM deal with
> the permissions. Thanks,

I think ARM would like to do this as well.

Hence my suggestion that guestmemfd could just have unchanging 1G PFNs
and all shared/private changes have no impact on iommufd.

If so likely all this needs is an invalidation callback from
guestmemfd to iommufd to revoke on ftruncate.

Jason
Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Aneesh Kumar K.V 2 months, 1 week ago
Jason Gunthorpe <jgg@ziepe.ca> writes:

> On Mon, Jul 28, 2025 at 07:21:47PM +0530, Aneesh Kumar K.V (Arm) wrote:
>> With passthrough devices, we need to make sure private memory is
>> allocated and assigned to the secure guest before we can issue the DMA.
>> For ARM RMM, we only need to map and the secure SMMU management is
>> internal to RMM. For shared IPA, vfio/iommufd DMA MAP/UNMAP interface
>> does the equivalent
>
> I'm not really sure what this is about? It is about getting KVM to pin
> all the memory and commit it to the RMM so it can be used for DMA?
>

That is correct.

>
> But it looks really strange to have an iommufd ioctl that just calls a
> KVM function. Feeling this should be a KVM function, or a guestmfd
> behavior??
>
This functionality is equivalent to `IOMMU_IOAS_MAP`, but in the
presence of firmware like RMM, we also need to supply the realm
descriptor associated with the KVM instance.

Initially, I attempted to handle this within the `map_pages` callback in
`iommu_domain_ops`, but that path lacks any awareness of the associated
KVM context, making it unsuitable for this purpose.


> I was kind of thinking it would be nice to have a guestmemfd mode that
> was "pinned", meaning the memory is allocated and remains almost
> always mapped into the TSM's page tables automatically. VFIO using
> guests would set things this way.
>

We need to allocate and free these pages dynamically as they are
converted between private and shared states.

-aneesh
Re: [RFC PATCH v1 10/38] iommufd/vdevice: Add TSM map ioctl
Posted by Jason Gunthorpe 2 months, 1 week ago
On Tue, Jul 29, 2025 at 02:07:55PM +0530, Aneesh Kumar K.V wrote:
> > But it looks really strange to have an iommufd ioctl that just calls a
> > KVM function. Feeling this should be a KVM function, or a guestmfd
> > behavior??
> >
> This functionality is equivalent to `IOMMU_IOAS_MAP`, but in the
> presence of firmware like RMM, we also need to supply the realm
> descriptor associated with the KVM instance.

There is no IOAS here because the secure world is using the KVM page
table. Since KVM owns this I don't see why iommufd should be invovled
in any way.

You need KVM to push the guestmemfd to the RMM and pin all the memory.

> > I was kind of thinking it would be nice to have a guestmemfd mode that
> > was "pinned", meaning the memory is allocated and remains almost
> > always mapped into the TSM's page tables automatically. VFIO using
> > guests would set things this way.
> 
> We need to allocate and free these pages dynamically as they are
> converted between private and shared states.

That's still within guestmemfd's area of concern and it can
immediately pin on state changes.

Jason