Introduce a new PV interface to manage the underlying IOMMU along with its
contexts and devices. This interface allows Dom0 to create new contexts and
to add IOMMU mappings expressed from the guest's point of view.

This interface doesn't allow creating mappings to other domains' memory.
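For illustration, a guest driver could drive the interface along the
following lines (a sketch only; hyper_iommu_op() stands in for whatever
guest-side wrapper ends up issuing __HYPERVISOR_iommu_op and is not part of
this patch):

    /* Sketch: set up PV-IOMMU, allocate a context and map one 4K page. */
    static int example_map_one_page(uint64_t gfn, uint64_t dfn)
    {
        struct pv_iommu_init init = { .max_ctx_no = 8, .arena_order = 9 };
        struct pv_iommu_alloc alloc = { 0 };
        struct pv_iommu_map_pages map = { 0 };
        int rc;

        rc = hyper_iommu_op(IOMMU_init, &init);
        if ( !rc )
            rc = hyper_iommu_op(IOMMU_alloc_context, &alloc);
        if ( rc )
            return rc;

        /* Map gfn at device address dfn, read/write, in the new context. */
        map.ctx_no = alloc.ctx_no;
        map.gfn = gfn;
        map.dfn = dfn;
        map.map_flags = IOMMU_MAP_readable | IOMMU_MAP_writeable;
        map.pgsize = 4096;
        map.nr_pages = 1;

        return hyper_iommu_op(IOMMU_map_pages, &map);
    }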
Signed-off-by: Teddy Astie <teddy.astie@vates.tech>
---
Changed in V2:
* formatting
Changed in V3:
* prevent IOMMU operations on dying contexts
Changed in V4:
* redesigned hypercall interface [1]
* added remote_cmd and init logic
[1] https://lore.kernel.org/all/fdfa32c9-c177-4d05-891a-138f9b663f19@vates.tech/
---
xen/common/Makefile | 1 +
xen/common/pv-iommu.c | 539 ++++++++++++++++++++++++++++++++++
xen/include/hypercall-defs.c | 6 +
xen/include/public/pv-iommu.h | 341 +++++++++++++++++++++
xen/include/public/xen.h | 1 +
5 files changed, 888 insertions(+)
create mode 100644 xen/common/pv-iommu.c
create mode 100644 xen/include/public/pv-iommu.h
diff --git a/xen/common/Makefile b/xen/common/Makefile
index fc52e0857d..9d642ef635 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -58,6 +58,7 @@ obj-y += wait.o
obj-bin-y += warning.init.o
obj-$(CONFIG_XENOPROF) += xenoprof.o
obj-y += xmalloc_tlsf.o
+obj-y += pv-iommu.o
obj-bin-$(CONFIG_X86) += $(foreach n,decompress bunzip2 unxz unlzma lzo unlzo unlz4 unzstd earlycpio,$(n).init.o)
diff --git a/xen/common/pv-iommu.c b/xen/common/pv-iommu.c
new file mode 100644
index 0000000000..9c7d04b4c7
--- /dev/null
+++ b/xen/common/pv-iommu.c
@@ -0,0 +1,539 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * xen/common/pv-iommu.c
+ *
+ * PV-IOMMU hypercall interface.
+ */
+
+#include <xen/errno.h>
+#include <xen/mm.h>
+#include <xen/lib.h>
+#include <xen/iommu.h>
+#include <xen/sched.h>
+#include <xen/iocap.h>
+#include <xen/mm-frame.h>
+#include <xen/pci.h>
+#include <xen/guest_access.h>
+#include <asm/p2m.h>
+#include <asm/event.h>
+#include <asm/mm.h>
+#include <asm/iommu.h>
+#include <public/pv-iommu.h>
+
+#define PVIOMMU_PREFIX "[PV-IOMMU] "
+
+static int get_paged_frame(struct domain *d, gfn_t gfn, mfn_t *mfn,
+ struct page_info **page, bool readonly)
+{
+ int ret = 0;
+ p2m_type_t p2mt = p2m_invalid;
+
+ #ifdef CONFIG_X86
+ p2m_query_t query = P2M_ALLOC;
+
+ if ( !readonly )
+ query |= P2M_UNSHARE;
+
+ *mfn = get_gfn_type(d, gfn_x(gfn), &p2mt, query);
+ #else
+ *mfn = p2m_lookup(d, gfn, &p2mt);
+ #endif
+
+ if ( mfn_eq(*mfn, INVALID_MFN) )
+ {
+ /* No mapping ? */
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Trying to map to non-backed page frame (gfn=%"PRI_gfn
+ " p2mt=%d d%d)\n", gfn_x(gfn), p2mt, d->domain_id);
+
+ ret = -ENOENT;
+ }
+ else if ( p2m_is_any_ram(p2mt) && mfn_valid(*mfn) )
+ {
+ *page = get_page_from_mfn(*mfn, d);
+ ret = 0;
+ }
+ else if ( p2m_is_mmio(p2mt) ||
+ iomem_access_permitted(d, mfn_x(*mfn), mfn_x(*mfn)) )
+ {
+ *page = NULL;
+ ret = 0;
+ }
+ else
+ {
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Unexpected p2mt %d (d%d gfn=%"PRI_gfn" mfn=%"PRI_mfn")\n",
+ p2mt, d->domain_id, gfn_x(gfn), mfn_x(*mfn));
+
+ ret = -EPERM;
+ }
+
+ put_gfn(d, gfn_x(gfn));
+ return ret;
+}
+
+static bool can_use_iommu_check(struct domain *d)
+{
+ if ( !is_iommu_enabled(d) )
+ {
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "IOMMU disabled for this domain\n");
+ return false;
+ }
+
+ if ( !dom_iommu(d)->allow_pv_iommu )
+ {
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "PV-IOMMU disabled for this domain\n");
+ return false;
+ }
+
+ return true;
+}
+
+static long capabilities_op(struct pv_iommu_capabilities *cap, struct domain *d)
+{
+ cap->max_ctx_no = d->iommu.other_contexts.count;
+ cap->max_iova_addr = iommu_get_max_iova(d);
+
+ cap->max_pasid = 0; /* TODO */
+ cap->cap_flags = 0;
+
+ if ( !dom_iommu(d)->no_dma )
+ cap->cap_flags |= IOMMUCAP_default_identity;
+
+ cap->pgsize_mask = PAGE_SIZE_4K;
+
+ return 0;
+}
+
+static long init_op(struct pv_iommu_init *init, struct domain *d)
+{
+ if ( init->max_ctx_no == UINT32_MAX )
+ return -E2BIG;
+
+ return iommu_domain_pviommu_init(d, init->max_ctx_no + 1, init->arena_order);
+}
+
+static long alloc_context_op(struct pv_iommu_alloc *alloc, struct domain *d)
+{
+ uint16_t ctx_no = 0;
+ int status = 0;
+
+ status = iommu_context_alloc(d, &ctx_no, 0);
+
+ if ( status )
+ return status;
+
+ printk(XENLOG_G_INFO PVIOMMU_PREFIX
+ "Created IOMMU context %hu in d%d\n", ctx_no, d->domain_id);
+
+ alloc->ctx_no = ctx_no;
+ return 0;
+}
+
+static long free_context_op(struct pv_iommu_free *free, struct domain *d)
+{
+ int flags = IOMMU_TEARDOWN_PREEMPT;
+
+ if ( !free->ctx_no )
+ return -EINVAL;
+
+ if ( free->free_flags & IOMMU_FREE_reattach_default )
+ flags |= IOMMU_TEARDOWN_REATTACH_DEFAULT;
+
+ return iommu_context_free(d, free->ctx_no, flags);
+}
+
+static long reattach_device_op(struct pv_iommu_reattach_device *reattach,
+ struct domain *d)
+{
+ int ret;
+ device_t *pdev;
+ struct physdev_pci_device dev = reattach->dev;
+
+ pcidevs_lock();
+ pdev = pci_get_pdev(d, PCI_SBDF(dev.seg, dev.bus, dev.devfn));
+
+ if ( !pdev )
+ {
+ pcidevs_unlock();
+ return -ENOENT;
+ }
+
+ ret = iommu_reattach_context(d, d, pdev, reattach->ctx_no);
+
+ pcidevs_unlock();
+ return ret;
+}
+
+static long map_pages_op(struct pv_iommu_map_pages *map, struct domain *d)
+{
+ struct iommu_context *ctx;
+ int ret = 0, flush_ret;
+ struct page_info *page = NULL;
+ mfn_t mfn, mfn_lookup;
+ unsigned int flags = 0, lookup_flags, flush_flags = 0;
+ size_t i = 0;
+ dfn_t dfn0 = _dfn(map->dfn); /* original map->dfn */
+
+ if ( !map->ctx_no || !(ctx = iommu_get_context(d, map->ctx_no)) )
+ return -EINVAL;
+
+ if ( map->map_flags & IOMMU_MAP_readable )
+ flags |= IOMMUF_readable;
+
+ if ( map->map_flags & IOMMU_MAP_writeable )
+ flags |= IOMMUF_writable;
+
+ for ( i = 0; i < map->nr_pages; i++ )
+ {
+ gfn_t gfn = _gfn(map->gfn + i);
+ dfn_t dfn = _dfn(map->dfn + i);
+
+#ifdef CONFIG_X86
+ if ( iommu_identity_map_check(d, ctx, _mfn(map->dfn + i)) )
+ {
+ ret = -EADDRNOTAVAIL;
+ break;
+ }
+#endif
+
+ ret = get_paged_frame(d, gfn, &mfn, &page, 0);
+
+ if ( ret )
+ break;
+
+ /* Check for conflict with existing mappings */
+ if ( !iommu_lookup_page(d, dfn, &mfn_lookup, &lookup_flags, map->ctx_no) )
+ {
+ if ( page )
+ put_page(page);
+
+ ret = -EADDRINUSE;
+ break;
+ }
+
+ ret = iommu_map(d, dfn, mfn, 1, flags, &flush_flags, map->ctx_no);
+
+ if ( ret )
+ {
+ if ( page )
+ put_page(page);
+
+ break;
+ }
+
+ map->mapped++;
+
+ if ( (i & 0xff) && hypercall_preempt_check() )
+ {
+ i++;
+
+ map->gfn += i;
+ map->dfn += i;
+ map->nr_pages -= i;
+
+ ret = -ERESTART;
+ break;
+ }
+ }
+
+ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, map->ctx_no);
+
+ if ( flush_ret )
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Flush operation failed for d%dc%d (%d)\n", d->domain_id,
+ ctx->id, flush_ret);
+
+ iommu_put_context(ctx);
+
+ return ret;
+}
+
+static long unmap_pages_op(struct pv_iommu_unmap_pages *unmap, struct domain *d)
+{
+ struct iommu_context *ctx;
+ mfn_t mfn;
+ int ret = 0, flush_ret;
+ unsigned int flags, flush_flags = 0;
+ size_t i = 0;
+ dfn_t dfn0 = _dfn(unmap->dfn); /* original unmap->dfn */
+
+ if ( !unmap->ctx_no || !(ctx = iommu_get_context(d, unmap->ctx_no)) )
+ return -EINVAL;
+
+ for ( i = 0; i < unmap->nr_pages; i++ )
+ {
+ dfn_t dfn = _dfn(unmap->dfn + i);
+
+#ifdef CONFIG_X86
+ if ( iommu_identity_map_check(d, ctx, _mfn(unmap->dfn + i)) )
+ {
+ ret = -EADDRNOTAVAIL;
+ break;
+ }
+#endif
+
+ /* Check if there is a valid mapping for this domain */
+ if ( iommu_lookup_page(d, dfn, &mfn, &flags, unmap->ctx_no) )
+ {
+ ret = -ENOENT;
+ break;
+ }
+
+ ret = iommu_unmap(d, dfn, 1, 0, &flush_flags, unmap->ctx_no);
+
+ if ( ret )
+ break;
+
+ unmap->unmapped++;
+
+ /* Decrement reference counter (if needed) */
+ if ( mfn_valid(mfn) )
+ put_page(mfn_to_page(mfn));
+
+ if ( (i & 0xff) && hypercall_preempt_check() )
+ {
+ i++;
+
+ unmap->dfn += i;
+ unmap->nr_pages -= i;
+
+ ret = -ERESTART;
+ break;
+ }
+ }
+
+ flush_ret = iommu_iotlb_flush(d, dfn0, i, flush_flags, unmap->ctx_no);
+
+ if ( flush_ret )
+ printk(XENLOG_G_WARNING PVIOMMU_PREFIX
+ "Flush operation failed for d%dc%d (%d)\n", d->domain_id,
+ ctx->id, flush_ret);
+
+ iommu_put_context(ctx);
+
+ return ret;
+}
+
+static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg,
+ struct domain *d, bool remote);
+
+static long remote_cmd_op(struct pv_iommu_remote_cmd *remote_cmd,
+ struct domain *current_domain)
+{
+ long ret = 0;
+ struct domain *d;
+
+ /* TODO: use a better permission logic */
+ if ( !is_hardware_domain(current_domain) )
+ return -EPERM;
+
+ d = get_domain_by_id(remote_cmd->domid);
+
+ if ( !d )
+ return -ENOENT;
+
+ ret = do_iommu_subop(remote_cmd->subop, remote_cmd->arg, d, true);
+
+ put_domain(d);
+
+ return ret;
+}
+
+static long do_iommu_subop(int subop, XEN_GUEST_HANDLE_PARAM(void) arg,
+ struct domain *d, bool remote)
+{
+ long ret = 0;
+
+ switch ( subop )
+ {
+ case IOMMU_noop:
+ break;
+
+ case IOMMU_query_capabilities:
+ {
+ struct pv_iommu_capabilities cap;
+
+ ret = capabilities_op(&cap, d);
+
+ if ( unlikely(copy_to_guest(arg, &cap, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_init:
+ {
+ struct pv_iommu_init init;
+
+ if ( unlikely(copy_from_guest(&init, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = init_op(&init, d);
+ break;
+ }
+
+ case IOMMU_alloc_context:
+ {
+ struct pv_iommu_alloc alloc;
+
+ if ( unlikely(copy_from_guest(&alloc, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = alloc_context_op(&alloc, d);
+
+ if ( unlikely(copy_to_guest(arg, &alloc, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_free_context:
+ {
+ struct pv_iommu_free free;
+
+ if ( unlikely(copy_from_guest(&free, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = free_context_op(&free, d);
+ break;
+ }
+
+ case IOMMU_reattach_device:
+ {
+ struct pv_iommu_reattach_device reattach;
+
+ if ( unlikely(copy_from_guest(&reattach, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = reattach_device_op(&reattach, d);
+ break;
+ }
+
+ case IOMMU_map_pages:
+ {
+ struct pv_iommu_map_pages map;
+
+ if ( unlikely(copy_from_guest(&map, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = map_pages_op(&map, d);
+
+ if ( unlikely(copy_to_guest(arg, &map, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_unmap_pages:
+ {
+ struct pv_iommu_unmap_pages unmap;
+
+ if ( unlikely(copy_from_guest(&unmap, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = unmap_pages_op(&unmap, d);
+
+ if ( unlikely(copy_to_guest(arg, &unmap, 1)) )
+ ret = -EFAULT;
+
+ break;
+ }
+
+ case IOMMU_remote_cmd:
+ {
+ struct pv_iommu_remote_cmd remote_cmd;
+
+ if ( remote )
+ {
+ /* Prevent remote_cmd from being called recursively */
+ ret = -EINVAL;
+ break;
+ }
+
+ if ( unlikely(copy_from_guest(&remote_cmd, arg, 1)) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+
+ ret = remote_cmd_op(&remote_cmd, d);
+ break;
+ }
+
+ /*
+ * TODO
+ */
+ case IOMMU_alloc_nested:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ case IOMMU_flush_nested:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ case IOMMU_attach_pasid:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ case IOMMU_detach_pasid:
+ {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return ret;
+}
+
+long do_iommu_op(unsigned int subop, XEN_GUEST_HANDLE_PARAM(void) arg)
+{
+ long ret = 0;
+
+ if ( !can_use_iommu_check(current->domain) )
+ return -ENODEV;
+
+ ret = do_iommu_subop(subop, arg, current->domain, false);
+
+ if ( ret == -ERESTART )
+ return hypercall_create_continuation(__HYPERVISOR_iommu_op, "ih", subop, arg);
+
+ return ret;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/hypercall-defs.c b/xen/include/hypercall-defs.c
index 47c093acc8..59d7c02f55 100644
--- a/xen/include/hypercall-defs.c
+++ b/xen/include/hypercall-defs.c
@@ -209,6 +209,9 @@ hypfs_op(unsigned int cmd, const char *arg1, unsigned long arg2, void *arg3, uns
#ifdef CONFIG_X86
xenpmu_op(unsigned int op, xen_pmu_params_t *arg)
#endif
+#ifdef CONFIG_HAS_PASSTHROUGH
+iommu_op(unsigned int subop, void *arg)
+#endif
#ifdef CONFIG_PV
caller: pv64
@@ -295,5 +298,8 @@ mca do do - - -
#ifndef CONFIG_PV_SHIM_EXCLUSIVE
paging_domctl_cont do do do do -
#endif
+#ifdef CONFIG_HAS_PASSTHROUGH
+iommu_op do do do do -
+#endif
#endif /* !CPPCHECK */
diff --git a/xen/include/public/pv-iommu.h b/xen/include/public/pv-iommu.h
new file mode 100644
index 0000000000..c14b8435c9
--- /dev/null
+++ b/xen/include/public/pv-iommu.h
@@ -0,0 +1,341 @@
+/* SPDX-License-Identifier: MIT */
+/**
+ * pv-iommu.h
+ *
+ * Paravirtualized IOMMU driver interface.
+ *
+ * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
+ */
+
+#ifndef __XEN_PUBLIC_PV_IOMMU_H__
+#define __XEN_PUBLIC_PV_IOMMU_H__
+
+#include "xen.h"
+#include "physdev.h"
+
+#ifndef uint64_aligned_t
+#define uint64_aligned_t uint64_t
+#endif
+
+#define IOMMU_DEFAULT_CONTEXT (0)
+
+enum {
+ /* Basic cmd */
+ IOMMU_noop = 0,
+ IOMMU_query_capabilities,
+ IOMMU_init,
+ IOMMU_alloc_context,
+ IOMMU_free_context,
+ IOMMU_reattach_device,
+ IOMMU_map_pages,
+ IOMMU_unmap_pages,
+ IOMMU_remote_cmd,
+
+ /* Extended cmd */
+ IOMMU_alloc_nested, /* if IOMMUCAP_nested */
+ IOMMU_flush_nested, /* if IOMMUCAP_nested */
+ IOMMU_attach_pasid, /* if IOMMUCAP_pasid */
+ IOMMU_detach_pasid, /* if IOMMUCAP_pasid */
+};
+
+/**
+ * Indicate if the default context is a identity mapping to domain memory.
+ * If not defined, default context blocks all DMA to domain memory.
+ */
+#define IOMMUCAP_default_identity (1 << 0)
+
+/**
+ * IOMMU_MAP_cache support.
+ */
+#define IOMMUCAP_cache (1 << 1)
+
+/**
+ * Support for IOMMU_alloc_nested.
+ */
+#define IOMMUCAP_nested (1 << 2)
+
+/**
+ * Support for IOMMU_attach_pasid and IOMMU_detach_pasid and pasid parameter in
+ * reattach_context.
+ */
+#define IOMMUCAP_pasid (1 << 3)
+
+/**
+ * Support for IOMMU_ALLOC_identity
+ */
+#define IOMMUCAP_identity (1 << 4)
+
+/**
+ * IOMMU_query_capabilities
+ * Query PV-IOMMU capabilities for this domain.
+ */
+struct pv_iommu_capabilities {
+ /*
+ * OUT: Maximum device address (iova) that the guest can use for mappings.
+ */
+ uint64_aligned_t max_iova_addr;
+
+ /* OUT: IOMMU capabilities flags */
+ uint32_t cap_flags;
+
+ /* OUT: Mask of all supported page sizes. */
+ uint32_t pgsize_mask;
+
+ /* OUT: Maximum pasid (if IOMMUCAP_pasid) */
+ uint32_t max_pasid;
+
+ /* OUT: Maximum number of IOMMU context this domain can use. */
+ uint16_t max_ctx_no;
+};
+typedef struct pv_iommu_capabilities pv_iommu_capabilities_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_capabilities_t);
+
+/**
+ * IOMMU_init
+ * Initialize PV-IOMMU for this domain.
+ *
+ * Fails with -EACCES if PV-IOMMU is already initialized.
+ */
+struct pv_iommu_init {
+ /* IN: Maximum number of IOMMU context this domain can use. */
+ uint32_t max_ctx_no;
+
+ /* IN: Arena size in pages (in power of two) */
+ uint32_t arena_order;
+};
+typedef struct pv_iommu_init pv_iommu_init_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_init_t);
+
+/**
+ * Create a 1:1 identity mapped context to domain memory
+ * (needs IOMMUCAP_identity).
+ */
+#define IOMMU_ALLOC_identity (1 << 0)
+
+/**
+ * IOMMU_alloc_context
+ * Allocate an IOMMU context.
+ * Fails with -ENOSPC if no context number is available.
+ */
+struct pv_iommu_alloc {
+ /* OUT: allocated IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: allocation flags */
+ uint32_t alloc_flags;
+};
+typedef struct pv_iommu_alloc pv_iommu_alloc_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_t);
+
+/**
+ * Move all devices to default context before freeing the context.
+ */
+#define IOMMU_FREE_reattach_default (1 << 0)
+
+/**
+ * IOMMU_free_context
+ * Destroy an IOMMU context.
+ *
+ * If IOMMU_FREE_reattach_default is specified, move all context devices to
+ * default context before destroying this context.
+ *
+ * If there are devices in the context and IOMMU_FREE_reattach_default is not
+ * specified, fail with -EBUSY.
+ *
+ * The default context can't be destroyed.
+ */
+struct pv_iommu_free {
+ /* IN: IOMMU context number to free */
+ uint16_t ctx_no;
+
+ /* IN: Free operation specific flags */
+ uint32_t free_flags;
+};
+typedef struct pv_iommu_free pv_iommu_free_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_free_t);
+
+/* Device has read access */
+#define IOMMU_MAP_readable (1 << 0)
+
+/* Device has write access */
+#define IOMMU_MAP_writeable (1 << 1)
+
+/* Enforce DMA coherency */
+#define IOMMU_MAP_cache (1 << 2)
+
+/**
+ * IOMMU_map_pages
+ * Map pages on an IOMMU context.
+ *
+ * pgsize must be supported by pgsize_mask.
+ * Fails with -EINVAL if mapping on top of another mapping.
+ * Report the actually mapped page count in the mapped field (regardless of
+ * failure).
+ */
+struct pv_iommu_map_pages {
+ /* IN: IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: Guest frame number */
+ uint64_aligned_t gfn;
+
+ /* IN: Device frame number */
+ uint64_aligned_t dfn;
+
+ /* IN: Map flags */
+ uint32_t map_flags;
+
+ /* IN: Size of pages to map */
+ uint32_t pgsize;
+
+ /* IN: Number of pages to map */
+ uint32_t nr_pages;
+
+ /* OUT: Number of pages actually mapped */
+ uint32_t mapped;
+};
+typedef struct pv_iommu_map_pages pv_iommu_map_pages_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_map_pages_t);
+
+/**
+ * IOMMU_unmap_pages
+ * Unmap pages on an IOMMU context.
+ *
+ * pgsize must be supported by pgsize_mask.
+ * Report the actually unmapped page count in the unmapped field (regardless
+ * of failure).
+ * Fails with -ENOENT when attempting to unmap a page without any mapping.
+ */
+struct pv_iommu_unmap_pages {
+ /* IN: IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: Device frame number */
+ uint64_aligned_t dfn;
+
+ /* IN: Size of pages to unmap */
+ uint32_t pgsize;
+
+ /* IN: Number of pages to unmap */
+ uint32_t nr_pages;
+
+ /* OUT: Number of pages actually unmapped */
+ uint32_t unmapped;
+};
+typedef struct pv_iommu_unmap_pages pv_iommu_unmap_pages_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_unmap_pages_t);
+
+/**
+ * IOMMU_reattach_device
+ * Reattach a device to another IOMMU context.
+ * Fails with -ENODEV if no such device exists.
+ */
+struct pv_iommu_reattach_device {
+ /* IN: Target IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: Physical device to move */
+ struct physdev_pci_device dev;
+
+ /* IN: PASID of the device (if IOMMUCAP_pasid) */
+ uint32_t pasid;
+};
+typedef struct pv_iommu_reattach_device pv_iommu_reattach_device_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_reattach_device_t);
+
+
+/**
+ * IOMMU_remote_cmd
+ * Do a PV-IOMMU operation on another domain.
+ * The current domain needs to be allowed to act on the target domain,
+ * otherwise this fails with -EPERM.
+ */
+struct pv_iommu_remote_cmd {
+ /* IN: Target domain to do the subop on */
+ uint16_t domid;
+
+ /* IN: Command to do on target domain. */
+ uint16_t subop;
+
+ /* INOUT: Command argument from current domain memory */
+ XEN_GUEST_HANDLE(void) arg;
+};
+typedef struct pv_iommu_remote_cmd pv_iommu_remote_cmd_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_remote_cmd_t);
+
+/**
+ * IOMMU_alloc_nested
+ * Create a nested IOMMU context (needs IOMMUCAP_nested).
+ *
+ * This context uses a platform-specific page table from the domain's address
+ * space, specified in pgtable_gfn, and uses it for nested translations.
+ *
+ * Explicit flushes need to be submitted with IOMMU_flush_nested on
+ * modification of the nested page table to ensure coherency between the
+ * IOTLB and the nested page table.
+ *
+ * This context can be destroyed using IOMMU_free_context.
+ * This context cannot be modified using map_pages, unmap_pages.
+ */
+struct pv_iommu_alloc_nested {
+ /* OUT: allocated IOMMU context number */
+ uint16_t ctx_no;
+
+ /* IN: guest frame number of the nested page table */
+ uint64_aligned_t pgtable_gfn;
+
+ /* IN: nested mode flags */
+ uint64_aligned_t nested_flags;
+};
+typedef struct pv_iommu_alloc_nested pv_iommu_alloc_nested_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_alloc_nested_t);
+
+/**
+ * IOMMU_flush_nested (needs IOMMUCAP_nested)
+ * Flush the IOTLB for nested translation.
+ */
+struct pv_iommu_flush_nested {
+ /* TODO */
+};
+typedef struct pv_iommu_flush_nested pv_iommu_flush_nested_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_flush_nested_t);
+
+/**
+ * IOMMU_attach_pasid (needs IOMMUCAP_pasid)
+ * Attach a new device-with-pasid to an IOMMU context.
+ * If a matching device-with-pasid already exists (globally),
+ * fails with -EEXIST.
+ * If pasid is 0, fails with -EINVAL.
+ * If the physical device doesn't exist in the domain, fails with -ENOENT.
+ */
+struct pv_iommu_attach_pasid {
+ /* IN: IOMMU context to add the device-with-pasid in */
+ uint16_t ctx_no;
+
+ /* IN: Physical device */
+ struct physdev_pci_device dev;
+
+ /* IN: pasid of the device to attach */
+ uint32_t pasid;
+};
+typedef struct pv_iommu_attach_pasid pv_iommu_attach_pasid_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_attach_pasid_t);
+
+/**
+ * IOMMU_detach_pasid (needs IOMMUCAP_pasid)
+ * Detach a device-with-pasid.
+ * If the device-with-pasid doesn't exist or doesn't belong to the domain,
+ * fails with -ENOENT.
+ * If pasid is 0, fails with -EINVAL.
+ */
+struct pv_iommu_detach_pasid {
+ /* IN: Physical device */
+ struct physdev_pci_device dev;
+
+ /* IN: pasid of the device to detach */
+ uint32_t pasid;
+};
+typedef struct pv_iommu_detach_pasid pv_iommu_detach_pasid_t;
+DEFINE_XEN_GUEST_HANDLE(pv_iommu_detach_pasid_t);
+
+/* long do_iommu_op(unsigned int subop, XEN_GUEST_HANDLE_PARAM(void) arg) */
+
+#endif
\ No newline at end of file
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index b47d48d0e2..28ab815ebc 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -118,6 +118,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_ulong_t);
#define __HYPERVISOR_xenpmu_op 40
#define __HYPERVISOR_dm_op 41
#define __HYPERVISOR_hypfs_op 42
+#define __HYPERVISOR_iommu_op 43
/* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48
--
2.45.2
Teddy Astie | Vates XCP-ng Developer
XCP-ng & Xen Orchestra - Vates solutions
web: https://vates.tech
Hi,

A bunch of mostly ABI-related minor comments. Mostly about adding implicit
padding. Still reading through the logic, but this should be helpful as is,
so I'm sending it early.

On Mon Nov 4, 2024 at 2:28 PM GMT, Teddy Astie wrote:
[snip]
> diff --git a/xen/include/public/pv-iommu.h b/xen/include/public/pv-iommu.h
> new file mode 100644
[snip]
> +#ifndef uint64_aligned_t
> +#define uint64_aligned_t uint64_t
> +#endif

Why did you need this? This header ought to depend on whatever implements
uint64_aligned_t, which is most definitely not a simple uint64_t on 32bit
machines.

> +#define IOMMU_DEFAULT_CONTEXT (0)
> +
> +enum {

nit: If you give the enum a fully qualified name (e.g: iommucmd), then you
can add a comment on the field in the struct saying "See iommucmd" and folks
will be able to follow the tag back here via cscope/ctags/global/whatever.

> + /* Basic cmd */
> + IOMMU_noop = 0,
> + IOMMU_query_capabilities,

nit: I think we want "= N" for every cmd. Otherwise it's very annoying to
find out which variant has which number as the number of variants increases.
Also makes it (a bit) more annoying to deprecate numbers.

[snip]
> +/**
> + * Indicate if the default context is a identity mapping to domain memory.
> + * If not defined, default context blocks all DMA to domain memory.
> + */

nit: s/is a/is an/. Though seeing how it's a bit in a bitmap it might make
more sense to rephrase it as "if set: X; if cleared: Y". The bit is always
defined, it's its state that signals a boolean property.

> +#define IOMMUCAP_default_identity (1 << 0)

This one, and all others want a U suffix (i.e: 1U << 0). Otherwise things
like 1 << 31 (if we ever get to that) will misbehave.

> +
> +/**
> + * IOMMU_MAP_cache support.
> + */
> +#define IOMMUCAP_cache (1 << 1)
[snip]
> +struct pv_iommu_capabilities {
[snip]
> + /* OUT: Maximum number of IOMMU context this domain can use. */
> + uint16_t max_ctx_no;

Missing u16 padding at the tail

> +};
> +typedef struct pv_iommu_capabilities pv_iommu_capabilities_t;
> +DEFINE_XEN_GUEST_HANDLE(pv_iommu_capabilities_t);
[snip]
> +/**
> + * Create a 1:1 identity mapped context to domain memory
> + * (needs IOMMUCAP_identity).
> + */
> +#define IOMMU_ALLOC_identity (1 << 0)

Same consideration as above. Wants to be 1U.

> +struct pv_iommu_alloc {
> + /* OUT: allocated IOMMU context number */
> + uint16_t ctx_no;

Missing u16 padding

> +
> + /* IN: allocation flags */
> + uint32_t alloc_flags;
> +};
[snip]
> +struct pv_iommu_free {
> + /* IN: IOMMU context number to free */
> + uint16_t ctx_no;

missing u16 padding

> +
> + /* IN: Free operation specific flags */
> + uint32_t free_flags;
> +};
[snip]
> +struct pv_iommu_map_pages {
> + /* IN: IOMMU context number */
> + uint16_t ctx_no;

missing 6 octets worth of padding.

> +
> + /* IN: Guest frame number */
> + uint64_aligned_t gfn;
[snip]
> +struct pv_iommu_unmap_pages {
> + /* IN: IOMMU context number */
> + uint16_t ctx_no;

missing 6 octets worth of padding. There's little reason for it to be the
first member though, afaics. Which means it could move to the end and use
part of the existing padding there instead...

> +
> + /* IN: Device frame number */
> + uint64_aligned_t dfn;
[snip]
> + /* OUT: Number of pages actually unmapped */
> + uint32_t unmapped;

... (which is a u32). We'd go from 10 octets worth of padding to a single u16.

> +};
[snip]
> +struct pv_iommu_reattach_device {
> + /* IN: Target IOMMU context number */
> + uint16_t ctx_no;
> +
> + /* IN: Physical device to move */
> + struct physdev_pci_device dev;

Missing u16 padding, I think?

> +
> + /* IN: PASID of the device (if IOMMUCAP_pasid) */
> + uint32_t pasid;
> +};
[snip]
> +struct pv_iommu_remote_cmd {
> + /* IN: Target domain to do the subop on */
> + uint16_t domid;
> +
> + /* IN: Command to do on target domain. */
> + uint16_t subop;

missing u32 padding.

> +
> + /* INOUT: Command argument from current domain memory */
> + XEN_GUEST_HANDLE(void) arg;
> +};
[snip]
> +struct pv_iommu_alloc_nested {
> + /* OUT: allocated IOMMU context number */
> + uint16_t ctx_no;

missing 6 octets worth of padding.
nit: For this one and others, might be more helpful to have it at the tail
to allow more flexibility in how to use that padding in the future.

> +
> + /* IN: guest frame number of the nested page table */
> + uint64_aligned_t pgtable_gfn;
> +
> + /* IN: nested mode flags */
> + uint64_aligned_t nested_flags;
> +};
[snip]
> +struct pv_iommu_attach_pasid {
> + /* IN: IOMMU context to add the device-with-pasid in */
> + uint16_t ctx_no;
> +
> + /* IN: Physical device */
> + struct physdev_pci_device dev;

This subfield is non-obviously 16bit aligned (and we can't change it because
it's already part of the ABI). This means (I think), there's a missing u16
padding here.

> +
> + /* IN: pasid of the device to attach */
> + uint32_t pasid;
> +};
[snip]
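To make the padding suggestion above concrete, struct pv_iommu_alloc with
explicit padding could look like the sketch below (illustrative only; the
pad field name is arbitrary and not part of the posted patch):

    struct pv_iommu_alloc {
        /* OUT: allocated IOMMU context number */
        uint16_t ctx_no;

        /* Explicit padding so the struct layout is identical across ABIs. */
        uint16_t pad;

        /* IN: allocation flags */
        uint32_t alloc_flags;
    };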