In the context of Xen, Linux runs as Dom0 and does not have access to the
machine IOMMU. However, an IOMMU is mandatory for some kernel features
such as VFIO or DMA protection.

In Xen, we added a paravirtualized IOMMU exposed through the iommu_op
hypercall in order to allow Dom0 to implement such features. This commit
introduces a new IOMMU driver that uses this new hypercall interface.

Signed-off-by: Teddy Astie <teddy.astie@vates.tech>
---
arch/x86/include/asm/xen/hypercall.h | 6 +
drivers/iommu/Kconfig | 9 +
drivers/iommu/Makefile | 1 +
drivers/iommu/xen-iommu.c | 508 +++++++++++++++++++++++++++
include/xen/interface/memory.h | 33 ++
include/xen/interface/pv-iommu.h | 114 ++++++
include/xen/interface/xen.h | 1 +
7 files changed, 672 insertions(+)
create mode 100644 drivers/iommu/xen-iommu.c
create mode 100644 include/xen/interface/pv-iommu.h
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index a2dd24947eb8..6b1857f27c14 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -490,6 +490,12 @@ HYPERVISOR_xenpmu_op(unsigned int op, void *arg)
return _hypercall2(int, xenpmu_op, op, arg);
}
+static inline int
+HYPERVISOR_iommu_op(void *arg)
+{
+ return _hypercall1(int, iommu_op, arg);
+}
+
static inline int
HYPERVISOR_dm_op(
domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 2b12b583ef4b..8d8a22b91e34 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -482,6 +482,15 @@ config VIRTIO_IOMMU
Say Y here if you intend to run this kernel as a guest.
+config XEN_IOMMU
+ bool "Xen IOMMU driver"
+ depends on XEN_DOM0
+ select IOMMU_API
+ help
+ Xen PV-IOMMU driver for Dom0.
+
+ Say Y here if you intend to run this guest as Xen Dom0.
+
config SPRD_IOMMU
tristate "Unisoc IOMMU Support"
depends on ARCH_SPRD || COMPILE_TEST
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 769e43d780ce..11fa258d3a04 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -30,3 +30,4 @@ obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o
obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
obj-$(CONFIG_APPLE_DART) += apple-dart.o
+obj-$(CONFIG_XEN_IOMMU) += xen-iommu.o
\ No newline at end of file
diff --git a/drivers/iommu/xen-iommu.c b/drivers/iommu/xen-iommu.c
new file mode 100644
index 000000000000..2c8e42240a6b
--- /dev/null
+++ b/drivers/iommu/xen-iommu.c
@@ -0,0 +1,508 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Xen PV-IOMMU driver.
+ *
+ * Copyright (C) 2024 Vates SAS
+ *
+ * Author: Teddy Astie <teddy.astie@vates.tech>
+ *
+ */
+
+#define pr_fmt(fmt) "xen-iommu: " fmt
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/iommu.h>
+#include <linux/dma-map-ops.h>
+#include <linux/pci.h>
+#include <linux/list.h>
+#include <linux/string.h>
+#include <linux/device/driver.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/printk.h>
+#include <linux/stddef.h>
+#include <linux/spinlock.h>
+#include <linux/minmax.h>
+#include <linux/string.h>
+#include <asm/iommu.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/physdev.h>
+#include <xen/interface/pv-iommu.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/page.h>
+
+MODULE_DESCRIPTION("Xen IOMMU driver");
+MODULE_AUTHOR("Teddy Astie <teddy.astie@vates.tech>");
+MODULE_LICENSE("GPL");
+
+#define MSI_RANGE_START (0xfee00000)
+#define MSI_RANGE_END (0xfeefffff)
+
+#define XEN_IOMMU_PGSIZES (0x1000)
+
+struct xen_iommu_domain {
+ struct iommu_domain domain;
+
+ u16 ctx_no; /* Xen PV-IOMMU context number */
+};
+
+static struct iommu_device xen_iommu_device;
+
+static uint32_t max_nr_pages;
+static uint64_t max_iova_addr;
+
+static spinlock_t lock;
+
+static inline struct xen_iommu_domain *to_xen_iommu_domain(struct iommu_domain *dom)
+{
+ return container_of(dom, struct xen_iommu_domain, domain);
+}
+
+static inline u64 addr_to_pfn(u64 addr)
+{
+ return addr >> 12;
+}
+
+static inline u64 pfn_to_addr(u64 pfn)
+{
+ return pfn << 12;
+}
+
+bool xen_iommu_capable(struct device *dev, enum iommu_cap cap)
+{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
+{
+ struct xen_iommu_domain *domain;
+ u16 ctx_no;
+ int ret;
+
+ if (type & IOMMU_DOMAIN_IDENTITY) {
+ /* use default domain */
+ ctx_no = 0;
+ } else {
+ struct pv_iommu_op op = {
+ .ctx_no = 0,
+ .flags = 0,
+ .subop_id = IOMMUOP_alloc_context
+ };
+
+ ret = HYPERVISOR_iommu_op(&op);
+
+ if (ret) {
+ pr_err("Unable to create Xen IOMMU context (%d)", ret);
+ return ERR_PTR(ret);
+ }
+
+ ctx_no = op.ctx_no;
+ }
+
+ domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+
+ domain->ctx_no = ctx_no;
+
+ domain->domain.geometry.aperture_start = 0;
+
+ domain->domain.geometry.aperture_end = max_iova_addr;
+ domain->domain.geometry.force_aperture = true;
+
+ return &domain->domain;
+}
+
+static struct iommu_group *xen_iommu_device_group(struct device *dev)
+{
+ if (!dev_is_pci(dev))
+ return ERR_PTR(-ENODEV);
+
+ return pci_device_group(dev);
+}
+
+static struct iommu_device *xen_iommu_probe_device(struct device *dev)
+{
+ if (!dev_is_pci(dev))
+ return ERR_PTR(-ENODEV);
+
+ return &xen_iommu_device;
+}
+
+static void xen_iommu_probe_finalize(struct device *dev)
+{
+ set_dma_ops(dev, NULL);
+ iommu_setup_dma_ops(dev, 0, max_iova_addr);
+}
+
+static void xen_iommu_release_device(struct device *dev)
+{
+ int ret;
+ struct pci_dev *pdev;
+ struct pv_iommu_op op = {
+ .subop_id = IOMMUOP_reattach_device,
+ .flags = 0,
+ .ctx_no = 0 /* reattach device back to default context */
+ };
+
+ if (!dev_is_pci(dev))
+ return;
+
+ pdev = to_pci_dev(dev);
+
+ op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
+ op.reattach_device.dev.bus = pdev->bus->number;
+ op.reattach_device.dev.devfn = pdev->devfn;
+
+ ret = HYPERVISOR_iommu_op(&op);
+
+ if (ret)
+ pr_warn("Unable to release device %p\n", &op.reattach_device.dev);
+}
+
+static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
+ phys_addr_t paddr, size_t pgsize, size_t pgcount,
+ int prot, gfp_t gfp, size_t *mapped)
+{
+ size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
+ struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+ struct pv_iommu_op op = {
+ .subop_id = IOMMUOP_map_pages,
+ .flags = 0,
+ .ctx_no = dom->ctx_no
+ };
+ /* NOTE: paddr is actually bound to pfn, not gfn */
+ uint64_t pfn = addr_to_pfn(paddr);
+ uint64_t dfn = addr_to_pfn(iova);
+ int ret = 0;
+
+ if (WARN(!dom->ctx_no, "Tried to map page to default context"))
+ return -EINVAL;
+
+ //pr_info("Mapping to %lx %zu %zu paddr %x\n", iova, pgsize, pgcount, paddr);
+
+ if (prot & IOMMU_READ)
+ op.flags |= IOMMU_OP_readable;
+
+ if (prot & IOMMU_WRITE)
+ op.flags |= IOMMU_OP_writeable;
+
+ while (xen_pg_count) {
+ size_t to_map = min(xen_pg_count, max_nr_pages);
+ uint64_t gfn = pfn_to_gfn(pfn);
+
+ //pr_info("Mapping %lx-%lx at %lx-%lx\n", gfn, gfn + to_map - 1, dfn, dfn + to_map - 1);
+
+ op.map_pages.gfn = gfn;
+ op.map_pages.dfn = dfn;
+
+ op.map_pages.nr_pages = to_map;
+
+ ret = HYPERVISOR_iommu_op(&op);
+
+ //pr_info("map_pages.mapped = %u\n", op.map_pages.mapped);
+
+ if (mapped)
+ *mapped += XEN_PAGE_SIZE * op.map_pages.mapped;
+
+ if (ret)
+ break;
+
+ xen_pg_count -= to_map;
+
+ pfn += to_map;
+ dfn += to_map;
+ }
+
+ return ret;
+}
+
+static size_t xen_iommu_unmap_pages(struct iommu_domain *domain, unsigned long iova,
+ size_t pgsize, size_t pgcount,
+ struct iommu_iotlb_gather *iotlb_gather)
+{
+ size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
+ struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+ struct pv_iommu_op op = {
+ .subop_id = IOMMUOP_unmap_pages,
+ .ctx_no = dom->ctx_no,
+ .flags = 0,
+ };
+ uint64_t dfn = addr_to_pfn(iova);
+ int ret = 0;
+
+ if (WARN(!dom->ctx_no, "Tried to unmap page to default context"))
+ return -EINVAL;
+
+ while (xen_pg_count) {
+ size_t to_unmap = min(xen_pg_count, max_nr_pages);
+
+ //pr_info("Unmapping %lx-%lx\n", dfn, dfn + to_unmap - 1);
+
+ op.unmap_pages.dfn = dfn;
+ op.unmap_pages.nr_pages = to_unmap;
+
+ ret = HYPERVISOR_iommu_op(&op);
+
+ if (ret)
+ pr_warn("Unmap failure (%lx-%lx)\n", dfn, dfn + to_unmap - 1);
+
+ xen_pg_count -= to_unmap;
+
+ dfn += to_unmap;
+ }
+
+ return pgcount * pgsize;
+}
+
+int xen_iommu_attach_dev(struct iommu_domain *domain, struct device *dev)
+{
+ struct pci_dev *pdev;
+ struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+ struct pv_iommu_op op = {
+ .subop_id = IOMMUOP_reattach_device,
+ .flags = 0,
+ .ctx_no = dom->ctx_no,
+ };
+
+ if (!dev_is_pci(dev))
+ return -EINVAL;
+
+ pdev = to_pci_dev(dev);
+
+ op.reattach_device.dev.seg = pci_domain_nr(pdev->bus);
+ op.reattach_device.dev.bus = pdev->bus->number;
+ op.reattach_device.dev.devfn = pdev->devfn;
+
+ return HYPERVISOR_iommu_op(&op);
+}
+
+static void xen_iommu_free(struct iommu_domain *domain)
+{
+ int ret;
+ struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+
+ if (dom->ctx_no != 0) {
+ struct pv_iommu_op op = {
+ .ctx_no = dom->ctx_no,
+ .flags = 0,
+ .subop_id = IOMMUOP_free_context
+ };
+
+ ret = HYPERVISOR_iommu_op(&op);
+
+ if (ret)
+ pr_err("Context %hu destruction failure\n", dom->ctx_no);
+ }
+
+ kfree(domain);
+}
+
+static phys_addr_t xen_iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
+{
+ int ret;
+ struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
+
+ struct pv_iommu_op op = {
+ .ctx_no = dom->ctx_no,
+ .flags = 0,
+ .subop_id = IOMMUOP_lookup_page,
+ };
+
+ op.lookup_page.dfn = addr_to_pfn(iova);
+
+ ret = HYPERVISOR_iommu_op(&op);
+
+ if (ret)
+ return 0;
+
+ phys_addr_t page_addr = pfn_to_addr(gfn_to_pfn(op.lookup_page.gfn));
+
+ /* Consider non-aligned iova */
+ return page_addr + (iova & 0xFFF);
+}
+
+static void xen_iommu_get_resv_regions(struct device *dev, struct list_head *head)
+{
+ struct iommu_resv_region *reg;
+ struct xen_reserved_device_memory *entries;
+ struct xen_reserved_device_memory_map map;
+ struct pci_dev *pdev;
+ int ret, i;
+
+ if (!dev_is_pci(dev))
+ return;
+
+ pdev = to_pci_dev(dev);
+
+ reg = iommu_alloc_resv_region(MSI_RANGE_START,
+ MSI_RANGE_END - MSI_RANGE_START + 1,
+ 0, IOMMU_RESV_MSI, GFP_KERNEL);
+
+ if (!reg)
+ return;
+
+ list_add_tail(®->list, head);
+
+ /* Map xen-specific entries */
+
+ /* First, get number of entries to map */
+ map.buffer = NULL;
+ map.nr_entries = 0;
+ map.flags = 0;
+
+ map.dev.pci.seg = pci_domain_nr(pdev->bus);
+ map.dev.pci.bus = pdev->bus->number;
+ map.dev.pci.devfn = pdev->devfn;
+
+ ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
+
+ if (ret == 0)
+ /* No reserved region, nothing to do */
+ return;
+
+ if (ret != -ENOBUFS) {
+ pr_err("Unable to get reserved region count (%d)\n", ret);
+ return;
+ }
+
+ /* Assume a reasonable number of entries, otherwise, something is probably wrong */
+ if (WARN_ON(map.nr_entries > 256))
+ pr_warn("Xen reporting many reserved regions (%u)\n", map.nr_entries);
+
+ /* And finally get actual mappings */
+ entries = kcalloc(map.nr_entries, sizeof(struct xen_reserved_device_memory),
+ GFP_KERNEL);
+
+ if (!entries) {
+ pr_err("No memory for map entries\n");
+ return;
+ }
+
+ map.buffer = entries;
+
+ ret = HYPERVISOR_memory_op(XENMEM_reserved_device_memory_map, &map);
+
+ if (ret != 0) {
+ pr_err("Unable to get reserved regions (%d)\n", ret);
+ kfree(entries);
+ return;
+ }
+
+ for (i = 0; i < map.nr_entries; i++) {
+ struct xen_reserved_device_memory entry = entries[i];
+
+ reg = iommu_alloc_resv_region(pfn_to_addr(entry.start_pfn),
+ pfn_to_addr(entry.nr_pages),
+ 0, IOMMU_RESV_RESERVED, GFP_KERNEL);
+
+ if (!reg)
+ break;
+
+ list_add_tail(®->list, head);
+ }
+
+ kfree(entries);
+}
+
+static struct iommu_ops xen_iommu_ops = {
+ .capable = xen_iommu_capable,
+ .domain_alloc = xen_iommu_domain_alloc,
+ .probe_device = xen_iommu_probe_device,
+ .probe_finalize = xen_iommu_probe_finalize,
+ .device_group = xen_iommu_device_group,
+ .release_device = xen_iommu_release_device,
+ .get_resv_regions = xen_iommu_get_resv_regions,
+ .pgsize_bitmap = XEN_IOMMU_PGSIZES,
+ .default_domain_ops = &(const struct iommu_domain_ops) {
+ .map_pages = xen_iommu_map_pages,
+ .unmap_pages = xen_iommu_unmap_pages,
+ .attach_dev = xen_iommu_attach_dev,
+ .iova_to_phys = xen_iommu_iova_to_phys,
+ .free = xen_iommu_free,
+ },
+};
+
+int __init xen_iommu_init(void)
+{
+ int ret;
+ struct pv_iommu_op op = {
+ .subop_id = IOMMUOP_query_capabilities
+ };
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ /* Check if iommu_op is supported */
+ if (HYPERVISOR_iommu_op(&op) == -ENOSYS)
+ return -ENODEV; /* No Xen IOMMU hardware */
+
+ pr_info("Initialising Xen IOMMU driver\n");
+ pr_info("max_nr_pages=%d\n", op.cap.max_nr_pages);
+ pr_info("max_ctx_no=%d\n", op.cap.max_ctx_no);
+ pr_info("max_iova_addr=%llx\n", op.cap.max_iova_addr);
+
+ if (op.cap.max_ctx_no == 0) {
+ pr_err("Unable to use IOMMU PV driver (no context available)\n");
+ return -ENOTSUPP; /* Unable to use IOMMU PV ? */
+ }
+
+ if (xen_domain_type == XEN_PV_DOMAIN)
+ /* TODO: In PV domain, due to the existing pfn-gfn mapping we need to
+ * consider that under certain circumstances, we have:
+ * pfn_to_gfn(x + 1) != pfn_to_gfn(x) + 1
+ *
+ * In these cases, we would want to separate the subop into several calls.
+ * (only doing the grouped operation when the mapping is actually contiguous)
+ * Only map operation would be affected, as unmap actually uses dfn which
+ * doesn't have this kind of mapping.
+ *
+ * Force single-page operations to work around this issue for now.
+ */
+ max_nr_pages = 1;
+ else
+ /* With HVM domains, pfn_to_gfn is identity, there is no issue regarding this. */
+ max_nr_pages = op.cap.max_nr_pages;
+
+ max_iova_addr = op.cap.max_iova_addr;
+
+ spin_lock_init(&lock);
+
+ ret = iommu_device_sysfs_add(&xen_iommu_device, NULL, NULL, "xen-iommu");
+ if (ret) {
+ pr_err("Unable to add Xen IOMMU sysfs\n");
+ return ret;
+ }
+
+ ret = iommu_device_register(&xen_iommu_device, &xen_iommu_ops, NULL);
+ if (ret) {
+ pr_err("Unable to register Xen IOMMU device %d\n", ret);
+ iommu_device_sysfs_remove(&xen_iommu_device);
+ return ret;
+ }
+
+ /* swiotlb is redundant when IOMMU is active. */
+ x86_swiotlb_enable = false;
+
+ return 0;
+}
+
+void __exit xen_iommu_fini(void)
+{
+ pr_info("Unregistering Xen IOMMU driver\n");
+
+ iommu_device_unregister(&xen_iommu_device);
+ iommu_device_sysfs_remove(&xen_iommu_device);
+}
+
+module_init(xen_iommu_init);
+module_exit(xen_iommu_fini);
diff --git a/include/xen/interface/memory.h b/include/xen/interface/memory.h
index 1a371a825c55..08571add426b 100644
--- a/include/xen/interface/memory.h
+++ b/include/xen/interface/memory.h
@@ -10,6 +10,7 @@
#ifndef __XEN_PUBLIC_MEMORY_H__
#define __XEN_PUBLIC_MEMORY_H__
+#include "xen/interface/physdev.h"
#include <linux/spinlock.h>
/*
@@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
};
DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
+/*
+ * With some legacy devices, certain guest-physical addresses cannot safely
+ * be used for other purposes, e.g. to map guest RAM. This hypercall
+ * enumerates those regions so the toolstack can avoid using them.
+ */
+#define XENMEM_reserved_device_memory_map 27
+struct xen_reserved_device_memory {
+ xen_pfn_t start_pfn;
+ xen_ulong_t nr_pages;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
+
+struct xen_reserved_device_memory_map {
+#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
+ /* IN */
+ uint32_t flags;
+ /*
+ * IN/OUT
+ *
+ * Gets set to the required number of entries when too low,
+ * signaled by error code -ERANGE.
+ */
+ unsigned int nr_entries;
+ /* OUT */
+ GUEST_HANDLE(xen_reserved_device_memory) buffer;
+ /* IN */
+ union {
+ struct physdev_pci_device pci;
+ } dev;
+};
+DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
+
/*
* Returns the pseudo-physical memory map as it was when the domain
* was started (specified by XENMEM_set_memory_map).
diff --git a/include/xen/interface/pv-iommu.h b/include/xen/interface/pv-iommu.h
new file mode 100644
index 000000000000..5560609d0e7a
--- /dev/null
+++ b/include/xen/interface/pv-iommu.h
@@ -0,0 +1,114 @@
+/* SPDX-License-Identifier: MIT */
+/******************************************************************************
+ * pv-iommu.h
+ *
+ * Paravirtualized IOMMU driver interface.
+ *
+ * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
+ */
+
+#ifndef __XEN_PUBLIC_PV_IOMMU_H__
+#define __XEN_PUBLIC_PV_IOMMU_H__
+
+#include "xen.h"
+#include "physdev.h"
+
+#define IOMMU_DEFAULT_CONTEXT (0)
+
+/**
+ * Query PV-IOMMU capabilities for this domain.
+ */
+#define IOMMUOP_query_capabilities 1
+
+/**
+ * Allocate an IOMMU context, the new context handle will be written to ctx_no.
+ */
+#define IOMMUOP_alloc_context 2
+
+/**
+ * Destroy a IOMMU context.
+ * All devices attached to this context are reattached to default context.
+ *
+ * The default context can't be destroyed (0).
+ */
+#define IOMMUOP_free_context 3
+
+/**
+ * Reattach the device to IOMMU context.
+ */
+#define IOMMUOP_reattach_device 4
+
+#define IOMMUOP_map_pages 5
+#define IOMMUOP_unmap_pages 6
+
+/**
+ * Get the GFN associated to a specific DFN.
+ */
+#define IOMMUOP_lookup_page 7
+
+struct pv_iommu_op {
+ uint16_t subop_id;
+ uint16_t ctx_no;
+
+/**
+ * Create a context that is cloned from default.
+ * The new context will be populated with 1:1 mappings covering the entire guest memory.
+ */
+#define IOMMU_CREATE_clone (1 << 0)
+
+#define IOMMU_OP_readable (1 << 0)
+#define IOMMU_OP_writeable (1 << 1)
+ uint32_t flags;
+
+ union {
+ struct {
+ uint64_t gfn;
+ uint64_t dfn;
+ /* Number of pages to map */
+ uint32_t nr_pages;
+ /* Number of pages actually mapped after sub-op */
+ uint32_t mapped;
+ } map_pages;
+
+ struct {
+ uint64_t dfn;
+ /* Number of pages to unmap */
+ uint32_t nr_pages;
+ /* Number of pages actually unmapped after sub-op */
+ uint32_t unmapped;
+ } unmap_pages;
+
+ struct {
+ struct physdev_pci_device dev;
+ } reattach_device;
+
+ struct {
+ uint64_t gfn;
+ uint64_t dfn;
+ } lookup_page;
+
+ struct {
+ /* Maximum number of IOMMU context this domain can use. */
+ uint16_t max_ctx_no;
+ /* Maximum number of pages that can be modified in a single map/unmap operation. */
+ uint32_t max_nr_pages;
+ /* Maximum device address (iova) that the guest can use for mappings. */
+ uint64_t max_iova_addr;
+ } cap;
+ };
+};
+
+typedef struct pv_iommu_op pv_iommu_op_t;
+DEFINE_GUEST_HANDLE_STRUCT(pv_iommu_op_t);
+
+#endif
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
\ No newline at end of file
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index 0ca23eca2a9c..8b1daf3fecc6 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -65,6 +65,7 @@
#define __HYPERVISOR_xc_reserved_op 39 /* reserved for XenClient */
#define __HYPERVISOR_xenpmu_op 40
#define __HYPERVISOR_dm_op 41
+#define __HYPERVISOR_iommu_op 43
/* Architecture-specific hypercall definitions. */
#define __HYPERVISOR_arch_0 48
--
2.45.2
Teddy Astie | Vates XCP-ng Intern
XCP-ng & Xen Orchestra - Vates solutions
web: https://vates.tech
On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:

> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
> +{
> +	struct xen_iommu_domain *domain;
> +	u16 ctx_no;
> +	int ret;
> +
> +	if (type & IOMMU_DOMAIN_IDENTITY) {
> +		/* use default domain */
> +		ctx_no = 0;

Please use the new ops, domain_alloc_paging and the static identity
domain.

> +static struct iommu_group *xen_iommu_device_group(struct device *dev)
> +{
> +	if (!dev_is_pci(dev))
> +		return ERR_PTR(-ENODEV);
> +

device_group is only called after probe_device, since you already
exclude !pci during probe there is no need for this wrapper, just set
the op directly to pci_device_group.

> +static void xen_iommu_release_device(struct device *dev)
> +{
> +	int ret;
> +	struct pci_dev *pdev;
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_reattach_device,
> +		.flags = 0,
> +		.ctx_no = 0 /* reattach device back to default context */
> +	};

Consider if you can use release_domain for this, I think this is
probably a BLOCKED domain behavior.

> +	if (!dev_is_pci(dev))
> +		return;

No op is ever called on a non-probed device, remove all these checks.

> +static int xen_iommu_map_pages(struct iommu_domain *domain, unsigned long iova,
> +			       phys_addr_t paddr, size_t pgsize, size_t pgcount,
> +			       int prot, gfp_t gfp, size_t *mapped)
> +{
> +	size_t xen_pg_count = (pgsize / XEN_PAGE_SIZE) * pgcount;
> +	struct xen_iommu_domain *dom = to_xen_iommu_domain(domain);
> +	struct pv_iommu_op op = {
> +		.subop_id = IOMMUOP_map_pages,
> +		.flags = 0,
> +		.ctx_no = dom->ctx_no
> +	};
> +	/* NOTE: paddr is actually bound to pfn, not gfn */
> +	uint64_t pfn = addr_to_pfn(paddr);
> +	uint64_t dfn = addr_to_pfn(iova);
> +	int ret = 0;
> +
> +	if (WARN(!dom->ctx_no, "Tried to map page to default context"))
> +		return -EINVAL;

A paging domain should be the only domain ops that have a populated
map so this should be made impossible by construction.

Jason
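[For readers not familiar with the newer core interfaces mentioned above, the sketch below shows roughly what that shape looks like: a statically defined identity domain plus a domain_alloc_paging hook, with device_group pointing straight at pci_device_group. All function and variable names here are hypothetical, the IOMMUOP_alloc_context call is elided, and whether release_domain should be the identity (default-context) domain or a blocking one is precisely the open question in this thread; this is an illustrative sketch, not the actual v2 patch.]

/* Hypothetical sketch only; names are made up and not part of the submitted patch. */
static int xen_iommu_identity_attach(struct iommu_domain *domain,
				     struct device *dev)
{
	/* Would issue IOMMUOP_reattach_device back to context 0 here. */
	return 0;
}

static const struct iommu_domain_ops xen_iommu_identity_domain_ops = {
	.attach_dev = xen_iommu_identity_attach,
};

static struct iommu_domain xen_iommu_identity_domain = {
	.type = IOMMU_DOMAIN_IDENTITY,
	.ops = &xen_iommu_identity_domain_ops,
};

static struct iommu_domain *xen_iommu_domain_alloc_paging(struct device *dev)
{
	struct xen_iommu_domain *domain = kzalloc(sizeof(*domain), GFP_KERNEL);

	if (!domain)
		return NULL;

	/* IOMMUOP_alloc_context would be issued here, as in the original alloc path. */

	domain->domain.geometry.aperture_start = 0;
	domain->domain.geometry.aperture_end = max_iova_addr;
	domain->domain.geometry.force_aperture = true;

	return &domain->domain;
}

static struct iommu_ops xen_iommu_ops = {
	.identity_domain = &xen_iommu_identity_domain,
	.release_domain = &xen_iommu_identity_domain, /* or a blocking domain, see discussion */
	.domain_alloc_paging = xen_iommu_domain_alloc_paging,
	.device_group = pci_device_group,
	.probe_device = xen_iommu_probe_device,
	/* ... map_pages/unmap_pages/iova_to_phys etc. unchanged ... */
};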
Hello Jason,

On 19/06/2024 at 18:30, Jason Gunthorpe wrote:
> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>
>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>> +{
>> +	struct xen_iommu_domain *domain;
>> +	u16 ctx_no;
>> +	int ret;
>> +
>> +	if (type & IOMMU_DOMAIN_IDENTITY) {
>> +		/* use default domain */
>> +		ctx_no = 0;
>
> Please use the new ops, domain_alloc_paging and the static identity domain.

Yes, in the v2, I will use this newer interface.

I have a question on this new interface : is it valid to not have a
identity domain (and "default domain" being blocking); well in the
current implementation it doesn't really matter, but at some point, we
may want to allow not having it (thus making this driver mandatory).

>
>> +static struct iommu_group *xen_iommu_device_group(struct device *dev)
>> +{
>> +	if (!dev_is_pci(dev))
>> +		return ERR_PTR(-ENODEV);
>> +
>
> device_group is only called after probe_device, since you already
> exclude !pci during probe there is no need for this wrapper, just set
> the op directly to pci_device_group.
>
>> +	if (!dev_is_pci(dev))
>> +		return;
>
> No op is ever called on a non-probed device, remove all these checks.
>
>
> A paging domain should be the only domain ops that have a populated
> map so this should be made impossible by construction

Makes sense, will remove these redundant checks in v2.

>
>> +static void xen_iommu_release_device(struct device *dev)
>> +{
>> +	int ret;
>> +	struct pci_dev *pdev;
>> +	struct pv_iommu_op op = {
>> +		.subop_id = IOMMUOP_reattach_device,
>> +		.flags = 0,
>> +		.ctx_no = 0 /* reattach device back to default context */
>> +	};
>
> Consider if you can use release_domain for this, I think this is
> probably a BLOCKED domain behavior.

The goal is to put back all devices where they were at the beginning
(the default "context"), which is what release_domain looks like it is
doing. Will use it for v2.

>
> Jason

Teddy

Teddy Astie | Vates XCP-ng Intern
XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
On 6/21/24 11:09 PM, Teddy Astie wrote:
> On 19/06/2024 at 18:30, Jason Gunthorpe wrote:
>> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>>
>>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>>> +{
>>> +	struct xen_iommu_domain *domain;
>>> +	u16 ctx_no;
>>> +	int ret;
>>> +
>>> +	if (type & IOMMU_DOMAIN_IDENTITY) {
>>> +		/* use default domain */
>>> +		ctx_no = 0;
>> Please use the new ops, domain_alloc_paging and the static identity domain.
> Yes, in the v2, I will use this newer interface.
>
> I have a question on this new interface : is it valid to not have a
> identity domain (and "default domain" being blocking); well in the
> current implementation it doesn't really matter, but at some point, we
> may want to allow not having it (thus making this driver mandatory).

It's valid to not have an identity domain if "default domain being
blocking" means a paging domain with no mappings.

In the iommu driver's iommu_ops::def_domain_type callback, just always
return IOMMU_DOMAIN_DMA, which indicates that the iommu driver doesn't
support identity translation.

Best regards,
baolu
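[As a concrete illustration of the callback described above — a hypothetical sketch only, and note the follow-up below pointing out that the core already falls back to IOMMU_DOMAIN_DMA when no identity domain is provided:]

static int xen_iommu_def_domain_type(struct device *dev)
{
	/* Identity translation is not offered, so always request a DMA (paging) domain. */
	return IOMMU_DOMAIN_DMA;
}

/* Hooked up via:  .def_domain_type = xen_iommu_def_domain_type,  in xen_iommu_ops. */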
On 2024-06-23 4:21 am, Baolu Lu wrote:
> On 6/21/24 11:09 PM, Teddy Astie wrote:
>> On 19/06/2024 at 18:30, Jason Gunthorpe wrote:
>>> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>>>
>>>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>>>> +{
>>>> +	struct xen_iommu_domain *domain;
>>>> +	u16 ctx_no;
>>>> +	int ret;
>>>> +
>>>> +	if (type & IOMMU_DOMAIN_IDENTITY) {
>>>> +		/* use default domain */
>>>> +		ctx_no = 0;
>>> Please use the new ops, domain_alloc_paging and the static identity
>>> domain.
>> Yes, in the v2, I will use this newer interface.
>>
>> I have a question on this new interface : is it valid to not have a
>> identity domain (and "default domain" being blocking); well in the
>> current implementation it doesn't really matter, but at some point, we
>> may want to allow not having it (thus making this driver mandatory).
>
> It's valid to not have an identity domain if "default domain being
> blocking" means a paging domain with no mappings.
>
> In the iommu driver's iommu_ops::def_domain_type callback, just always
> return IOMMU_DOMAIN_DMA, which indicates that the iommu driver doesn't
> support identity translation.

That's not necessary - if neither ops->identity_domain nor
ops->domain_alloc(IOMMU_DOMAIN_IDENTITY) gives a valid domain then we
fall back to IOMMU_DOMAIN_DMA anyway.

Thanks,
Robin.
On 6/24/24 7:09 PM, Robin Murphy wrote:
> On 2024-06-23 4:21 am, Baolu Lu wrote:
>> On 6/21/24 11:09 PM, Teddy Astie wrote:
>>> On 19/06/2024 at 18:30, Jason Gunthorpe wrote:
>>>> On Thu, Jun 13, 2024 at 01:50:22PM +0000, Teddy Astie wrote:
>>>>
>>>>> +struct iommu_domain *xen_iommu_domain_alloc(unsigned type)
>>>>> +{
>>>>> +	struct xen_iommu_domain *domain;
>>>>> +	u16 ctx_no;
>>>>> +	int ret;
>>>>> +
>>>>> +	if (type & IOMMU_DOMAIN_IDENTITY) {
>>>>> +		/* use default domain */
>>>>> +		ctx_no = 0;
>>>> Please use the new ops, domain_alloc_paging and the static identity
>>>> domain.
>>> Yes, in the v2, I will use this newer interface.
>>>
>>> I have a question on this new interface : is it valid to not have a
>>> identity domain (and "default domain" being blocking); well in the
>>> current implementation it doesn't really matter, but at some point, we
>>> may want to allow not having it (thus making this driver mandatory).
>>
>> It's valid to not have an identity domain if "default domain being
>> blocking" means a paging domain with no mappings.
>>
>> In the iommu driver's iommu_ops::def_domain_type callback, just always
>> return IOMMU_DOMAIN_DMA, which indicates that the iommu driver doesn't
>> support identity translation.
>
> That's not necessary - if neither ops->identity_domain nor
> ops->domain_alloc(IOMMU_DOMAIN_IDENTITY) gives a valid domain then we
> fall back to IOMMU_DOMAIN_DMA anyway.

Yes. That's true.

Best regards,
baolu
On 13.06.2024 15:50, Teddy Astie wrote:
> @@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
> };
> DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
>
> +/*
> + * With some legacy devices, certain guest-physical addresses cannot safely
> + * be used for other purposes, e.g. to map guest RAM. This hypercall
> + * enumerates those regions so the toolstack can avoid using them.
> + */
> +#define XENMEM_reserved_device_memory_map 27
> +struct xen_reserved_device_memory {
> +    xen_pfn_t start_pfn;
> +    xen_ulong_t nr_pages;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
> +
> +struct xen_reserved_device_memory_map {
> +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
> +    /* IN */
> +    uint32_t flags;
> +    /*
> +     * IN/OUT
> +     *
> +     * Gets set to the required number of entries when too low,
> +     * signaled by error code -ERANGE.
> +     */
> +    unsigned int nr_entries;
> +    /* OUT */
> +    GUEST_HANDLE(xen_reserved_device_memory) buffer;
> +    /* IN */
> +    union {
> +        struct physdev_pci_device pci;
> +    } dev;
> +};
> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);

This is a tools-only (i.e. unstable) sub-function in Xen; even the comment
at the top says "toolstack". It is therefore not suitable for use in a
kernel.

> --- /dev/null
> +++ b/include/xen/interface/pv-iommu.h
> @@ -0,0 +1,114 @@
> +/* SPDX-License-Identifier: MIT */
> +/******************************************************************************
> + * pv-iommu.h
> + *
> + * Paravirtualized IOMMU driver interface.
> + *
> + * Copyright (c) 2024 Teddy Astie <teddy.astie@vates.tech>
> + */
> +
> +#ifndef __XEN_PUBLIC_PV_IOMMU_H__
> +#define __XEN_PUBLIC_PV_IOMMU_H__
> +
> +#include "xen.h"
> +#include "physdev.h"
> +
> +#define IOMMU_DEFAULT_CONTEXT (0)
> +
> +/**
> + * Query PV-IOMMU capabilities for this domain.
> + */
> +#define IOMMUOP_query_capabilities 1
> +
> +/**
> + * Allocate an IOMMU context, the new context handle will be written to ctx_no.
> + */
> +#define IOMMUOP_alloc_context 2
> +
> +/**
> + * Destroy a IOMMU context.
> + * All devices attached to this context are reattached to default context.
> + *
> + * The default context can't be destroyed (0).
> + */
> +#define IOMMUOP_free_context 3
> +
> +/**
> + * Reattach the device to IOMMU context.
> + */
> +#define IOMMUOP_reattach_device 4
> +
> +#define IOMMUOP_map_pages 5
> +#define IOMMUOP_unmap_pages 6
> +
> +/**
> + * Get the GFN associated to a specific DFN.
> + */
> +#define IOMMUOP_lookup_page 7
> +
> +struct pv_iommu_op {
> +    uint16_t subop_id;
> +    uint16_t ctx_no;
> +
> +/**
> + * Create a context that is cloned from default.
> + * The new context will be populated with 1:1 mappings covering the entire guest memory.
> + */
> +#define IOMMU_CREATE_clone (1 << 0)
> +
> +#define IOMMU_OP_readable (1 << 0)
> +#define IOMMU_OP_writeable (1 << 1)
> +    uint32_t flags;
> +
> +    union {
> +        struct {
> +            uint64_t gfn;
> +            uint64_t dfn;
> +            /* Number of pages to map */
> +            uint32_t nr_pages;
> +            /* Number of pages actually mapped after sub-op */
> +            uint32_t mapped;
> +        } map_pages;
> +
> +        struct {
> +            uint64_t dfn;
> +            /* Number of pages to unmap */
> +            uint32_t nr_pages;
> +            /* Number of pages actually unmapped after sub-op */
> +            uint32_t unmapped;
> +        } unmap_pages;
> +
> +        struct {
> +            struct physdev_pci_device dev;
> +        } reattach_device;
> +
> +        struct {
> +            uint64_t gfn;
> +            uint64_t dfn;
> +        } lookup_page;
> +
> +        struct {
> +            /* Maximum number of IOMMU context this domain can use. */
> +            uint16_t max_ctx_no;
> +            /* Maximum number of pages that can be modified in a single map/unmap operation. */
> +            uint32_t max_nr_pages;
> +            /* Maximum device address (iova) that the guest can use for mappings. */
> +            uint64_t max_iova_addr;
> +        } cap;
> +    };
> +};
> +
> +typedef struct pv_iommu_op pv_iommu_op_t;
> +DEFINE_GUEST_HANDLE_STRUCT(pv_iommu_op_t);
> +
> +#endif
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> \ No newline at end of file

Nit: I'm pretty sure you want to avoid this.

Jan
On 13/06/2024 at 16:32, Jan Beulich wrote:
> On 13.06.2024 15:50, Teddy Astie wrote:
>> @@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
>> };
>> DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
>>
>> +/*
>> + * With some legacy devices, certain guest-physical addresses cannot safely
>> + * be used for other purposes, e.g. to map guest RAM. This hypercall
>> + * enumerates those regions so the toolstack can avoid using them.
>> + */
>> +#define XENMEM_reserved_device_memory_map 27
>> +struct xen_reserved_device_memory {
>> +    xen_pfn_t start_pfn;
>> +    xen_ulong_t nr_pages;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
>> +
>> +struct xen_reserved_device_memory_map {
>> +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
>> +    /* IN */
>> +    uint32_t flags;
>> +    /*
>> +     * IN/OUT
>> +     *
>> +     * Gets set to the required number of entries when too low,
>> +     * signaled by error code -ERANGE.
>> +     */
>> +    unsigned int nr_entries;
>> +    /* OUT */
>> +    GUEST_HANDLE(xen_reserved_device_memory) buffer;
>> +    /* IN */
>> +    union {
>> +        struct physdev_pci_device pci;
>> +    } dev;
>> +};
>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
>
> This is a tools-only (i.e. unstable) sub-function in Xen; even the comment
> at the top says "toolstack". It is therefore not suitable for use in a
> kernel.
>

IMO this comment actually describes how the toolstack uses the
hypercall, but I don't think it is actually reserved for toolstack use.
Or maybe we should allow the kernel to use this hypercall as well.

>> +
>> +/*
>> + * Local variables:
>> + * mode: C
>> + * c-file-style: "BSD"
>> + * c-basic-offset: 4
>> + * tab-width: 4
>> + * indent-tabs-mode: nil
>> + * End:
>> + */
>> \ No newline at end of file
>
> Nit: I'm pretty sure you want to avoid this.
>

Indeed.

Teddy

Teddy Astie | Vates XCP-ng Intern
XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
On 17.06.2024 15:36, Teddy Astie wrote:
> On 13/06/2024 at 16:32, Jan Beulich wrote:
>> On 13.06.2024 15:50, Teddy Astie wrote:
>>> @@ -214,6 +215,38 @@ struct xen_add_to_physmap_range {
>>> };
>>> DEFINE_GUEST_HANDLE_STRUCT(xen_add_to_physmap_range);
>>>
>>> +/*
>>> + * With some legacy devices, certain guest-physical addresses cannot safely
>>> + * be used for other purposes, e.g. to map guest RAM. This hypercall
>>> + * enumerates those regions so the toolstack can avoid using them.
>>> + */
>>> +#define XENMEM_reserved_device_memory_map 27
>>> +struct xen_reserved_device_memory {
>>> +    xen_pfn_t start_pfn;
>>> +    xen_ulong_t nr_pages;
>>> +};
>>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory);
>>> +
>>> +struct xen_reserved_device_memory_map {
>>> +#define XENMEM_RDM_ALL 1 /* Request all regions (ignore dev union). */
>>> +    /* IN */
>>> +    uint32_t flags;
>>> +    /*
>>> +     * IN/OUT
>>> +     *
>>> +     * Gets set to the required number of entries when too low,
>>> +     * signaled by error code -ERANGE.
>>> +     */
>>> +    unsigned int nr_entries;
>>> +    /* OUT */
>>> +    GUEST_HANDLE(xen_reserved_device_memory) buffer;
>>> +    /* IN */
>>> +    union {
>>> +        struct physdev_pci_device pci;
>>> +    } dev;
>>> +};
>>> +DEFINE_GUEST_HANDLE_STRUCT(xen_reserved_device_memory_map);
>>
>> This is a tools-only (i.e. unstable) sub-function in Xen; even the comment
>> at the top says "toolstack". It is therefore not suitable for use in a
>> kernel.
>>
> IMO this comment actually describes how the toolstack uses the
> hypercall, but I don't think it is actually reserved for toolstack use.

Well, the canonical version of the header is quite explicit about this,
by having the definition in a __XEN__ || __XEN_TOOLS__ section.

> Or maybe we should allow the kernel to use this hypercall as well.

That's an option to consider.

Jan
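[For context, the guarding Jan refers to in the canonical Xen public header looks roughly like the sketch below. This is paraphrased rather than copied from xen/include/public/memory.h, so treat the exact surrounding content as an assumption and check the header for the authoritative form.]

/* Rough sketch of the canonical xen/include/public/memory.h layout (not verbatim). */
#if defined(__XEN__) || defined(__XEN_TOOLS__)

#define XENMEM_reserved_device_memory_map   27
struct xen_reserved_device_memory {
    xen_pfn_t start_pfn;
    xen_ulong_t nr_pages;
};
/* ... */

#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */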