[PATCH v3 7/7] iommu/riscv: Paging domain support

Posted by Tomasz Jeznach 1 year, 7 months ago
Introduce first-stage address translation support.

The page table configured by the IOMMU driver uses the same format
as the CPU's MMU, and the driver falls back to identity translation
if the page table format configured for the MMU is not supported by
the IOMMU hardware.

This change introduces the IOTINVAL.VMA command, required to invalidate
any cached IOATC entries after a mapping is updated or removed from the
paging domain.  Invalidations of non-leaf page table entries use IOTINVAL
for all addresses assigned to the protection domain on hardware that does
not support more granular non-leaf page table cache invalidations.

Signed-off-by: Tomasz Jeznach <tjeznach@rivosinc.com>
---
 drivers/iommu/riscv/iommu.c | 544 +++++++++++++++++++++++++++++++++++-
 1 file changed, 541 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 98c325cb0591..60b146adfcbb 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -40,6 +40,10 @@
 
 #define dev_to_iommu(dev) iommu_get_iommu_dev(dev, struct riscv_iommu_device, iommu)
 
+/* IOMMU PSCID allocation namespace. */
+static DEFINE_IDA(riscv_iommu_pscids);
+#define RISCV_IOMMU_MAX_PSCID		(BIT(20) - 1)
+
 /* Device resource-managed allocations */
 struct riscv_iommu_devres {
 	void *addr;
@@ -747,6 +751,136 @@ static int riscv_iommu_iodir_set_mode(struct riscv_iommu_device *iommu,
 	return 0;
 }
 
+/* This struct contains protection domain specific IOMMU driver data. */
+struct riscv_iommu_domain {
+	struct iommu_domain domain;
+	struct list_head bonds;
+	spinlock_t lock;		/* protect bonds list updates. */
+	int pscid;
+	int numa_node;
+	int amo_enabled:1;
+	unsigned int pgd_mode;
+	unsigned long *pgd_root;
+};
+
+#define iommu_domain_to_riscv(iommu_domain) \
+	container_of(iommu_domain, struct riscv_iommu_domain, domain)
+
+#define dev_to_domain(dev) \
+	iommu_domain_to_riscv(dev_iommu_priv_get(dev))
+
+/* Linkage between an iommu_domain and attached devices. */
+struct riscv_iommu_bond {
+	struct list_head list;
+	struct rcu_head rcu;
+	struct device *dev;
+};
+
+static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain, struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct riscv_iommu_bond *bond;
+	struct list_head *bonds;
+	unsigned long flags;
+
+	bond = kzalloc(sizeof(*bond), GFP_KERNEL);
+	if (!bond)
+		return -ENOMEM;
+	bond->dev = dev;
+
+	/* The list of devices attached to the domain is arranged based on the managing IOMMU device */
+	spin_lock_irqsave(&domain->lock, flags);
+	list_for_each_rcu(bonds, &domain->bonds)
+		if (dev_to_iommu(list_entry(bonds, struct riscv_iommu_bond, list)->dev) == iommu)
+			break;
+	list_add_rcu(&bond->list, bonds);
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	return 0;
+}
+
+static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, struct device *dev)
+{
+	struct riscv_iommu_bond *bond, *found = NULL;
+	unsigned long flags;
+
+	if (!domain)
+		return;
+
+	spin_lock_irqsave(&domain->lock, flags);
+	list_for_each_entry_rcu(bond, &domain->bonds, list) {
+		if (bond->dev == dev) {
+			list_del_rcu(&bond->list);
+			found = bond;
+		}
+	}
+	spin_unlock_irqrestore(&domain->lock, flags);
+
+	/* Release the bond and wait for all RCU read-side critical sections to complete. */
+	kfree_rcu(found, rcu);
+	synchronize_rcu();
+}
+
+/*
+ * Send IOTLB.INVAL for the whole address space for ranges larger than 2MB.
+ * This limit will be replaced with range invalidations, if supported by
+ * the hardware, once the RISC-V IOMMU architecture specification update
+ * for range invalidations becomes available.
+ */
+#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
+
+static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
+				    unsigned long start, unsigned long end)
+{
+	struct riscv_iommu_bond *bond;
+	struct riscv_iommu_device *iommu, *prev;
+	struct riscv_iommu_command cmd;
+	unsigned long len = end - start + 1;
+	unsigned long iova;
+
+	rcu_read_lock();
+
+	prev = NULL;
+	list_for_each_entry_rcu(bond, &domain->bonds, list) {
+		iommu = dev_to_iommu(bond->dev);
+
+		riscv_iommu_cmd_inval_vma(&cmd);
+		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
+		if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
+			for (iova = start; iova < end; iova += PAGE_SIZE) {
+				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
+				riscv_iommu_cmd_send(iommu, &cmd, 0);
+			}
+		} else {
+			riscv_iommu_cmd_send(iommu, &cmd, 0);
+		}
+
+		/*
+		 * An IOTLB invalidation request can be safely omitted if one has
+		 * already been sent to the IOMMU for the same PSCID; with the
+		 * domain->bonds list arranged by the device's IOMMU, it is
+		 * sufficient to check the last device the invalidation was sent to.
+		 */
+		if (iommu == prev)
+			continue;
+
+		prev = iommu;
+		riscv_iommu_cmd_send(iommu, &cmd, 0);
+	}
+
+	prev = NULL;
+	list_for_each_entry_rcu(bond, &domain->bonds, list) {
+		iommu = dev_to_iommu(bond->dev);
+		if (iommu == prev)
+			continue;
+
+		prev = iommu;
+		riscv_iommu_cmd_iofence(&cmd);
+		riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
+	}
+	rcu_read_unlock();
+}
+
 #define RISCV_IOMMU_FSC_BARE 0
 
 /*
@@ -766,10 +900,29 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
 {
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
 	struct riscv_iommu_dc *dc;
+	struct riscv_iommu_command cmd;
+	bool sync_required = false;
 	u64 tc;
 	int i;
 
-	/* Device context invalidation ignored for now. */
+	for (i = 0; i < fwspec->num_ids; i++) {
+		dc = riscv_iommu_get_dc(iommu, fwspec->ids[i]);
+		tc = READ_ONCE(dc->tc);
+		if (tc & RISCV_IOMMU_DC_TC_V) {
+			WRITE_ONCE(dc->tc, tc & ~RISCV_IOMMU_DC_TC_V);
+
+			/* Invalidate device context cached values */
+			riscv_iommu_cmd_iodir_inval_ddt(&cmd);
+			riscv_iommu_cmd_iodir_set_did(&cmd, fwspec->ids[i]);
+			riscv_iommu_cmd_send(iommu, &cmd, 0);
+			sync_required = true;
+		}
+	}
+
+	if (sync_required) {
+		riscv_iommu_cmd_iofence(&cmd);
+		riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_IOTINVAL_TIMEOUT);
+	}
 
 	/*
 	 * For device context with DC_TC_PDTV = 0, translation attributes valid bit
@@ -787,13 +940,389 @@ static void riscv_iommu_iodir_update(struct riscv_iommu_device *iommu,
 	}
 }
 
+/*
+ * IOVA page translation tree management.
+ */
+
+#define IOMMU_PAGE_SIZE_4K     BIT_ULL(12)
+#define IOMMU_PAGE_SIZE_2M     BIT_ULL(21)
+#define IOMMU_PAGE_SIZE_1G     BIT_ULL(30)
+#define IOMMU_PAGE_SIZE_512G   BIT_ULL(39)
+
+#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
+
+static void riscv_iommu_iotlb_flush_all(struct iommu_domain *iommu_domain)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+}
+
+static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
+				   struct iommu_iotlb_gather *gather)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+
+	riscv_iommu_iotlb_inval(domain, gather->start, gather->end);
+}
+
+static inline size_t get_page_size(size_t size)
+{
+	if (size >= IOMMU_PAGE_SIZE_512G)
+		return IOMMU_PAGE_SIZE_512G;
+	if (size >= IOMMU_PAGE_SIZE_1G)
+		return IOMMU_PAGE_SIZE_1G;
+	if (size >= IOMMU_PAGE_SIZE_2M)
+		return IOMMU_PAGE_SIZE_2M;
+	return IOMMU_PAGE_SIZE_4K;
+}
+
+#define _io_pte_present(pte)	((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
+#define _io_pte_leaf(pte)	((pte) & _PAGE_LEAF)
+#define _io_pte_none(pte)	((pte) == 0)
+#define _io_pte_entry(pn, prot)	((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
+
+static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
+				 unsigned long pte, struct list_head *freelist)
+{
+	unsigned long *ptr;
+	int i;
+
+	if (!_io_pte_present(pte) || _io_pte_leaf(pte))
+		return;
+
+	ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+
+	/* Recursively free all sub page table pages */
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = READ_ONCE(ptr[i]);
+		if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
+			riscv_iommu_pte_free(domain, pte, freelist);
+	}
+
+	if (freelist)
+		list_add_tail(&virt_to_page(ptr)->lru, freelist);
+	else
+		iommu_free_page(ptr);
+}
+
+static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
+					    unsigned long iova, size_t pgsize, gfp_t gfp)
+{
+	unsigned long *ptr = domain->pgd_root;
+	unsigned long pte, old;
+	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+	void *addr;
+
+	do {
+		const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+		/*
+		 * Note: the returned entry might be a non-leaf if there was an existing
+		 * mapping with smaller granularity. Up to the caller to replace and invalidate.
+		 */
+		if (((size_t)1 << shift) == pgsize)
+			return ptr;
+pte_retry:
+		pte = READ_ONCE(*ptr);
+		/*
+		 * This is very likely incorrect as we should not be adding a new mapping
+		 * with smaller granularity on top of an existing 2M/1G mapping. Fail.
+		 */
+		if (_io_pte_present(pte) && _io_pte_leaf(pte))
+			return NULL;
+		/*
+		 * Non-leaf entry is missing, allocate and try to add to the page table.
+		 * This might race with other mappings, retry on error.
+		 */
+		if (_io_pte_none(pte)) {
+			addr = iommu_alloc_page_node(domain->numa_node, gfp);
+			if (!addr)
+				return NULL;
+			old = pte;
+			pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
+			if (cmpxchg_relaxed(ptr, old, pte) != old) {
+				iommu_free_page(addr);
+				goto pte_retry;
+			}
+		}
+		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+	} while (level-- > 0);
+
+	return NULL;
+}
+
+static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
+					    unsigned long iova, size_t *pte_pgsize)
+{
+	unsigned long *ptr = domain->pgd_root;
+	unsigned long pte;
+	int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+
+	do {
+		const int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+		ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
+		pte = READ_ONCE(*ptr);
+		if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
+			*pte_pgsize = (size_t)1 << shift;
+			return ptr;
+		}
+		if (_io_pte_none(pte))
+			return NULL;
+		ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+	} while (level-- > 0);
+
+	return NULL;
+}
+
+static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
+				 unsigned long iova, phys_addr_t phys,
+				 size_t pgsize, size_t pgcount, int prot,
+				 gfp_t gfp, size_t *mapped)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	size_t size = 0;
+	size_t page_size = get_page_size(pgsize);
+	unsigned long *ptr;
+	unsigned long pte, old, pte_prot;
+	int rc = 0;
+	LIST_HEAD(freelist);
+
+	if (!(prot & IOMMU_WRITE))
+		pte_prot = _PAGE_BASE | _PAGE_READ;
+	else if (domain->amo_enabled)
+		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE;
+	else
+		pte_prot = _PAGE_BASE | _PAGE_READ | _PAGE_WRITE | _PAGE_DIRTY;
+
+	while (pgcount) {
+		ptr = riscv_iommu_pte_alloc(domain, iova, page_size, gfp);
+		if (!ptr) {
+			rc = -ENOMEM;
+			break;
+		}
+
+		old = READ_ONCE(*ptr);
+		pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
+		if (cmpxchg_relaxed(ptr, old, pte) != old)
+			continue;
+
+		riscv_iommu_pte_free(domain, old, &freelist);
+
+		size += page_size;
+		iova += page_size;
+		phys += page_size;
+		--pgcount;
+	}
+
+	*mapped = size;
+
+	if (!list_empty(&freelist)) {
+		/*
+		 * In the 1.0 spec version, the smallest scope we can use to invalidate
+		 * all levels of page table (i.e. leaf and non-leaf) is an
+		 * invalidate-all-PSCID IOTINVAL.VMA with AV=0. This will be updated
+		 * with hardware support for capability.NL (non-leaf) IOTINVAL command.
+		 */
+		riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+		iommu_put_pages_list(&freelist);
+	}
+
+	return rc;
+}
+
+static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
+				      unsigned long iova, size_t pgsize, size_t pgcount,
+				      struct iommu_iotlb_gather *gather)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	size_t size = pgcount << __ffs(pgsize);
+	unsigned long *ptr, old;
+	size_t unmapped = 0;
+	size_t pte_size;
+
+	while (unmapped < size) {
+		ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+		if (!ptr)
+			return unmapped;
+
+		/* partial unmap is not allowed, fail. */
+		if (iova & (pte_size - 1))
+			return unmapped;
+
+		old = READ_ONCE(*ptr);
+		if (cmpxchg_relaxed(ptr, old, 0) != old)
+			continue;
+
+		iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
+					    pte_size);
+
+		iova += pte_size;
+		unmapped += pte_size;
+	}
+
+	return unmapped;
+}
+
+static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain, dma_addr_t iova)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	unsigned long pte_size;
+	unsigned long *ptr;
+
+	ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
+	if (!ptr || !_io_pte_present(*ptr))
+		return 0;
+
+	return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
+}
+
+static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
+{
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	const unsigned long pfn = virt_to_pfn(domain->pgd_root);
+
+	WARN_ON(!list_empty(&domain->bonds));
+
+	if ((int)domain->pscid > 0)
+		ida_free(&riscv_iommu_pscids, domain->pscid);
+
+	riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
+	kfree(domain);
+}
+
+static bool riscv_iommu_pt_supported(struct riscv_iommu_device *iommu, int pgd_mode)
+{
+	switch (pgd_mode) {
+	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
+		return iommu->caps & RISCV_IOMMU_CAP_S_SV39;
+
+	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
+		return iommu->caps & RISCV_IOMMU_CAP_S_SV48;
+
+	case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
+		return iommu->caps & RISCV_IOMMU_CAP_S_SV57;
+	}
+	return false;
+}
+
+static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain,
+					    struct device *dev)
+{
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+	struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
+	u64 fsc, ta;
+
+	if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode))
+		return -ENODEV;
+
+	if (domain->numa_node == NUMA_NO_NODE)
+		domain->numa_node = dev_to_node(iommu->dev);
+
+	if (riscv_iommu_bond_link(domain, dev))
+		return -ENOMEM;
+
+	/*
+	 * Invalidate PSCID.
+	 * This invalidation might be redundant if the IOATC has already been cleared;
+	 * however, we do not keep track of that for domains not linked to a device.
+	 */
+	riscv_iommu_iotlb_inval(domain, 0, ULONG_MAX);
+
+	fsc = FIELD_PREP(RISCV_IOMMU_PC_FSC_MODE, domain->pgd_mode) |
+	      FIELD_PREP(RISCV_IOMMU_PC_FSC_PPN, virt_to_pfn(domain->pgd_root));
+	ta = FIELD_PREP(RISCV_IOMMU_PC_TA_PSCID, domain->pscid) | RISCV_IOMMU_PC_TA_V;
+
+	riscv_iommu_iodir_update(iommu, dev, fsc, ta);
+	riscv_iommu_bond_unlink(dev_to_domain(dev), dev);
+	dev_iommu_priv_set(dev, domain);
+
+	return 0;
+}
+
+static const struct iommu_domain_ops riscv_iommu_paging_domain_ops = {
+	.attach_dev = riscv_iommu_attach_paging_domain,
+	.free = riscv_iommu_free_paging_domain,
+	.map_pages = riscv_iommu_map_pages,
+	.unmap_pages = riscv_iommu_unmap_pages,
+	.iova_to_phys = riscv_iommu_iova_to_phys,
+	.iotlb_sync = riscv_iommu_iotlb_sync,
+	.flush_iotlb_all = riscv_iommu_iotlb_flush_all,
+};
+
+static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
+{
+	struct riscv_iommu_domain *domain;
+	struct riscv_iommu_device *iommu;
+
+	iommu = dev ? dev_to_iommu(dev) : NULL;
+	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
+	if (!domain)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD_RCU(&domain->bonds);
+	spin_lock_init(&domain->lock);
+	domain->numa_node = NUMA_NO_NODE;
+
+	/*
+	 * Follow system address translation mode.
+	 * RISC-V IOMMU ATP mode values match RISC-V CPU SATP mode values.
+	 */
+	domain->pgd_mode = satp_mode >> SATP_MODE_SHIFT;
+	if (iommu) {
+		if (!riscv_iommu_pt_supported(iommu, domain->pgd_mode)) {
+			dev_err(dev, "unsupported page table mode %x\n", domain->pgd_mode);
+			kfree(domain);
+			return ERR_PTR(-ENODEV);
+		}
+		domain->numa_node = dev_to_node(iommu->dev);
+		domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAP_AMO_HWAD);
+	}
+
+	domain->pgd_root = iommu_alloc_page_node(domain->numa_node, GFP_KERNEL_ACCOUNT);
+	if (!domain->pgd_root) {
+		kfree(domain);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1, RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
+	if (domain->pscid < 0) {
+		iommu_free_page(domain->pgd_root);
+		kfree(domain);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * Note: RISC-V Privilege spec mandates that virtual addresses
+	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
+	 * bits >= VA_BITS need to also be set or else we'll get a
+	 * page fault. However the code that creates the mappings
+	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
+	 * for now, so we'll end up with invalid virtual addresses
+	 * to map. As a workaround until we get this sorted out
+	 * limit the available virtual addresses to VA_BITS - 1.
+	 */
+	domain->domain.geometry.aperture_start = 0;
+	domain->domain.geometry.aperture_end = DMA_BIT_MASK(VA_BITS - 1);
+	domain->domain.geometry.force_aperture = true;
+
+	domain->domain.ops = &riscv_iommu_paging_domain_ops;
+
+	return &domain->domain;
+}
+
 static int riscv_iommu_attach_release_domain(struct iommu_domain *iommu_domain,
 					     struct device *dev)
 {
 	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
 
-	if (iommu->ddt_mode > RISCV_IOMMU_DDTP_MODE_BARE)
+	if (iommu->ddt_mode > RISCV_IOMMU_DDTP_MODE_BARE) {
 		riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, 0);
+		riscv_iommu_bond_unlink(dev_to_domain(dev), dev);
+		dev_iommu_priv_set(dev, NULL);
+	}
 
 	return 0;
 }
@@ -815,6 +1344,8 @@ static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain,
 		return 0;
 
 	riscv_iommu_iodir_update(iommu, dev, RISCV_IOMMU_FSC_BARE, RISCV_IOMMU_PC_TA_V);
+	riscv_iommu_bond_unlink(dev_to_domain(dev), dev);
+	dev_iommu_priv_set(dev, NULL);
 
 	return 0;
 }
@@ -828,7 +1359,12 @@ static struct iommu_domain riscv_iommu_identity_domain = {
 
 static int riscv_iommu_device_domain_type(struct device *dev)
 {
-	return IOMMU_DOMAIN_IDENTITY;
+	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
+
+	if (iommu->ddt_mode == RISCV_IOMMU_DDTP_MODE_BARE)
+		return IOMMU_DOMAIN_IDENTITY;
+
+	return 0;
 }
 
 static struct iommu_group *riscv_iommu_device_group(struct device *dev)
@@ -880,9 +1416,11 @@ static struct iommu_device *riscv_iommu_probe_device(struct device *dev)
 }
 
 static const struct iommu_ops riscv_iommu_ops = {
+	.pgsize_bitmap = SZ_4K,
 	.of_xlate = riscv_iommu_of_xlate,
 	.identity_domain = &riscv_iommu_identity_domain,
 	.release_domain = &riscv_iommu_release_domain,
+	.domain_alloc_paging = riscv_iommu_alloc_paging_domain,
 	.def_domain_type = riscv_iommu_device_domain_type,
 	.device_group = riscv_iommu_device_group,
 	.probe_device = riscv_iommu_probe_device,
-- 
2.34.1
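
As a side note on the level/shift arithmetic used by riscv_iommu_pte_alloc()
and riscv_iommu_pte_fetch() above, a small standalone sketch (not part of the
patch; constants hard-coded for clarity):

	#include <stdio.h>

	/*
	 * Illustrative only: print the walk levels and shifts for an Sv39
	 * first-stage table (4K base pages, 512 PTEs per table level).
	 * Sv39 starts at level 2, Sv48 at level 3, Sv57 at level 4.
	 */
	int main(void)
	{
		const int page_shift = 12;
		const int pt_shift = 9;
		int level;

		for (level = 2; level >= 0; level--) {
			int shift = page_shift + pt_shift * level;

			printf("level %d: shift %d, leaf size 0x%lx\n",
			       level, shift, 1UL << shift);
		}
		return 0;
	}

For Sv39 this yields shifts of 30, 21 and 12, matching the 1G/2M/4K leaf
sizes handled by get_page_size().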

Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Baolu Lu 1 year, 7 months ago
On 5/1/24 4:01 AM, Tomasz Jeznach wrote:
> +/*
> + * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
> + * This limit will be replaced with range invalidations, if supported by
> + * the hardware, when RISC-V IOMMU architecture specification update for
> + * range invalidations update will be available.
> + */
> +#define RISCV_IOMMU_IOTLB_INVAL_LIMIT	(2 << 20)
> +
> +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> +				    unsigned long start, unsigned long end)
> +{
> +	struct riscv_iommu_bond *bond;
> +	struct riscv_iommu_device *iommu, *prev;
> +	struct riscv_iommu_command cmd;
> +	unsigned long len = end - start + 1;
> +	unsigned long iova;
> +
> +	rcu_read_lock();
> +
> +	prev = NULL;
> +	list_for_each_entry_rcu(bond, &domain->bonds, list) {
> +		iommu = dev_to_iommu(bond->dev);
> +
> +		riscv_iommu_cmd_inval_vma(&cmd);
> +		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> +		if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> +			for (iova = start; iova < end; iova += PAGE_SIZE) {
> +				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> +				riscv_iommu_cmd_send(iommu, &cmd, 0);
> +			}
> +		} else {
> +			riscv_iommu_cmd_send(iommu, &cmd, 0);
> +		}
> +
> +		/*
> +		 * IOTLB invalidation request can be safely omitted if already sent
> +		 * to the IOMMU for the same PSCID, and with domain->bonds list
> +		 * arranged based on the device's IOMMU, it's sufficient to check
> +		 * last device the invalidation was sent to.
> +		 */
> +		if (iommu == prev)
> +			continue;
> +
> +		prev = iommu;
> +		riscv_iommu_cmd_send(iommu, &cmd, 0);
> +	}

I don't quite follow why not move the "if (iommu == prev)" check to the
top and remove the last riscv_iommu_cmd_send(). My understanding is
that we could simplify it like below:

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);
		if (iommu == prev)
			continue;

		/*
		 * Send an invalidation request to the request queue
		 * without wait.
		 */
		... ...

		prev = iommu;
	}

> +
> +	prev = NULL;
> +	list_for_each_entry_rcu(bond, &domain->bonds, list) {
> +		iommu = dev_to_iommu(bond->dev);
> +		if (iommu == prev)
> +			continue;
> +
> +		prev = iommu;
> +		riscv_iommu_cmd_iofence(&cmd);
> +		riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
> +	}
> +	rcu_read_unlock();
> +}

Best regards,
baolu
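
Filling in the elided invalidation commands from the patch, the single-loop
shape suggested above would look roughly like this (a sketch only, based on
the posted helpers, not tested):

	prev = NULL;
	list_for_each_entry_rcu(bond, &domain->bonds, list) {
		iommu = dev_to_iommu(bond->dev);

		/* Skip IOMMUs already covered for this PSCID. */
		if (iommu == prev)
			continue;

		/* Send the invalidation request without waiting. */
		riscv_iommu_cmd_inval_vma(&cmd);
		riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
		if (len && len < RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
			for (iova = start; iova < end; iova += PAGE_SIZE) {
				riscv_iommu_cmd_inval_set_addr(&cmd, iova);
				riscv_iommu_cmd_send(iommu, &cmd, 0);
			}
		} else {
			riscv_iommu_cmd_send(iommu, &cmd, 0);
		}

		prev = iommu;
	}

The second loop issuing IOFENCE.C per IOMMU would stay as posted.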
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Tomasz Jeznach 1 year, 7 months ago
On Wed, May 1, 2024 at 8:52 PM Baolu Lu <baolu.lu@linux.intel.com> wrote:
>
> On 5/1/24 4:01 AM, Tomasz Jeznach wrote:
> > +/*
> > + * Send IOTLB.INVAL for whole address space for ranges larger than 2MB.
> > + * This limit will be replaced with range invalidations, if supported by
> > + * the hardware, when RISC-V IOMMU architecture specification update for
> > + * range invalidations update will be available.
> > + */
> > +#define RISCV_IOMMU_IOTLB_INVAL_LIMIT        (2 << 20)
> > +
> > +static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
> > +                                 unsigned long start, unsigned long end)
> > +{
> > +     struct riscv_iommu_bond *bond;
> > +     struct riscv_iommu_device *iommu, *prev;
> > +     struct riscv_iommu_command cmd;
> > +     unsigned long len = end - start + 1;
> > +     unsigned long iova;
> > +
> > +     rcu_read_lock();
> > +
> > +     prev = NULL;
> > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > +             iommu = dev_to_iommu(bond->dev);
> > +
> > +             riscv_iommu_cmd_inval_vma(&cmd);
> > +             riscv_iommu_cmd_inval_set_pscid(&cmd, domain->pscid);
> > +             if (len && len >= RISCV_IOMMU_IOTLB_INVAL_LIMIT) {
> > +                     for (iova = start; iova < end; iova += PAGE_SIZE) {
> > +                             riscv_iommu_cmd_inval_set_addr(&cmd, iova);
> > +                             riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +                     }
> > +             } else {
> > +                     riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +             }
> > +
> > +             /*
> > +              * IOTLB invalidation request can be safely omitted if already sent
> > +              * to the IOMMU for the same PSCID, and with domain->bonds list
> > +              * arranged based on the device's IOMMU, it's sufficient to check
> > +              * last device the invalidation was sent to.
> > +              */
> > +             if (iommu == prev)
> > +                     continue;
> > +
> > +             prev = iommu;
> > +             riscv_iommu_cmd_send(iommu, &cmd, 0);
> > +     }
>
> I don't quite follow why not moving "if (iommu == prev)" check to the
> top and removing the last riscv_iommu_cmd_send(). My understanding is
> that we could make it simply like below:
>
>         prev = NULL;
>         list_for_each_entry_rcu(bond, &domain->bonds, list) {
>                 iommu = dev_to_iommu(bond->dev);
>                 if (iommu == prev)
>                         continue;
>
>                 /*
>                  * Send an invalidation request to the request queue
>                  * without wait.
>                  */
>                 ... ...
>
>                 prev = iommu;
>         }
>

Oh, thanks for spotting that.
The code section was very likely reordered while rebasing the patches...

> > +
> > +     prev = NULL;
> > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > +             iommu = dev_to_iommu(bond->dev);
> > +             if (iommu == prev)
> > +                     continue;
> > +
> > +             prev = iommu;
> > +             riscv_iommu_cmd_iofence(&cmd);
> > +             riscv_iommu_cmd_send(iommu, &cmd, RISCV_IOMMU_QUEUE_TIMEOUT);
> > +     }
> > +     rcu_read_unlock();
> > +}
>
> Best regards,
> baolu

Best regards,
- Tomasz
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Jason Gunthorpe 1 year, 7 months ago
On Tue, Apr 30, 2024 at 01:01:57PM -0700, Tomasz Jeznach wrote:

> +#define iommu_domain_to_riscv(iommu_domain) \
> +	container_of(iommu_domain, struct riscv_iommu_domain, domain)
> +
> +#define dev_to_domain(dev) \
> +	iommu_domain_to_riscv(dev_iommu_priv_get(dev))

Please use the priv properly and put a struct around it, you'll
certainly need this eventually to do the rest of the advanced
features.
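
For illustration, a minimal sketch of wrapping the priv pointer in a
per-device struct; the struct name and fields below are hypothetical,
not part of this series:

	/* Hypothetical per-device IOMMU driver data, installed at probe time. */
	struct riscv_iommu_info {
		struct riscv_iommu_domain *domain;
	};

	/* probe_device() would allocate and install it ... */
	struct riscv_iommu_info *info;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return ERR_PTR(-ENOMEM);
	dev_iommu_priv_set(dev, info);

	/*
	 * ... and attach/detach would then update info->domain instead of
	 * storing the domain pointer directly in the priv field.
	 */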

> +static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, struct device *dev)
> +{
> +	struct riscv_iommu_bond *bond, *found = NULL;
> +	unsigned long flags;
> +
> +	if (!domain)
> +		return;
> +
> +	spin_lock_irqsave(&domain->lock, flags);

This is never locked from an irq, you don't need to use the irqsave
variations.

> +	list_for_each_entry_rcu(bond, &domain->bonds, list) {
> +		if (bond->dev == dev) {
> +			list_del_rcu(&bond->list);
> +			found = bond;
> +		}
> +	}
> +	spin_unlock_irqrestore(&domain->lock, flags);
> +
> +	/* Release and wait for all read-rcu critical sections have completed. */
> +	kfree_rcu(found, rcu);
> +	synchronize_rcu();

Please no, synchronize_rcu() on a path like this is not so
reasonable.. Also you don't need kfree_rcu() if you write it like this.

This still looks better to do what I said before, put the iommu not
the dev in the bond struct.


> +static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
> +{
> +	struct riscv_iommu_domain *domain;
> +	struct riscv_iommu_device *iommu;
> +
> +	iommu = dev ? dev_to_iommu(dev) : NULL;
> +	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> +	if (!domain)
> +		return ERR_PTR(-ENOMEM);
> +
> +	INIT_LIST_HEAD_RCU(&domain->bonds);
> +	spin_lock_init(&domain->lock);
> +	domain->numa_node = NUMA_NO_NODE;
> +
> +	/*
> +	 * Follow system address translation mode.
> +	 * RISC-V IOMMU ATP mode values match RISC-V CPU SATP mode values.
> +	 */
> +	domain->pgd_mode = satp_mode >> SATP_MODE_SHIFT;

This seems really strange, the iommu paging domains should be
unrelated to what the CPU is doing. There is no connection between
these two concepts.

Just pick a size that the iommu supports.

The number of radix levels is a tunable a lot of iommus have that we
haven't really exposed to anything else yet.

> +	/*
> +	 * Note: RISC-V Privilege spec mandates that virtual addresses
> +	 * need to be sign-extended, so if (VA_BITS - 1) is set, all
> +	 * bits >= VA_BITS need to also be set or else we'll get a
> +	 * page fault. However the code that creates the mappings
> +	 * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
> +	 * for now, so we'll end up with invalid virtual addresses
> +	 * to map. As a workaround until we get this sorted out
> +	 * limit the available virtual addresses to VA_BITS - 1.
> +	 */
> +	domain->domain.geometry.aperture_start = 0;
> +	domain->domain.geometry.aperture_end = DMA_BIT_MASK(VA_BITS - 1);
> +	domain->domain.geometry.force_aperture = true;

Yikes.. This is probably the best solution long term anyhow, unless
you need to use the last page in VFIO for some reason.
  
>  static int riscv_iommu_device_domain_type(struct device *dev)
>  {
> -	return IOMMU_DOMAIN_IDENTITY;
> +	struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> +
> +	if (iommu->ddt_mode == RISCV_IOMMU_DDTP_MODE_BARE)
> +		return IOMMU_DOMAIN_IDENTITY;
> +

Is there even a point of binding an iommu driver if the HW can't
support a DDT table? Just return -ENODEV from probe_device?

Logically a iommu block that can't decode the RID has no association
at all with a Linux struct device :)

Jason
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Tomasz Jeznach 1 year, 7 months ago
On Wed, May 1, 2024 at 7:56 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Tue, Apr 30, 2024 at 01:01:57PM -0700, Tomasz Jeznach wrote:
>
> > +#define iommu_domain_to_riscv(iommu_domain) \
> > +     container_of(iommu_domain, struct riscv_iommu_domain, domain)
> > +
> > +#define dev_to_domain(dev) \
> > +     iommu_domain_to_riscv(dev_iommu_priv_get(dev))
>
> Please use the priv properly and put a struct around it, you'll
> certainly need this eventually to do the rest of the advanced
> features.
>

Done. Yes, indeed, I was going to introduce a proper struct in follow-up
patches anyway. I've pulled this change in sooner.

> > +static void riscv_iommu_bond_unlink(struct riscv_iommu_domain *domain, struct device *dev)
> > +{
> > +     struct riscv_iommu_bond *bond, *found = NULL;
> > +     unsigned long flags;
> > +
> > +     if (!domain)
> > +             return;
> > +
> > +     spin_lock_irqsave(&domain->lock, flags);
>
> This is never locked from an irq, you don't need to use the irqsave
> variations.
>

Good point. done in v4.

> > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > +             if (bond->dev == dev) {
> > +                     list_del_rcu(&bond->list);
> > +                     found = bond;
> > +             }
> > +     }
> > +     spin_unlock_irqrestore(&domain->lock, flags);
> > +
> > +     /* Release and wait for all read-rcu critical sections have completed. */
> > +     kfree_rcu(found, rcu);
> > +     synchronize_rcu();
>
> Please no, synchronize_rcu() on a path like this is not so
> reasonable.. Also you don't need kfree_rcu() if you write it like this.
>
> This still looks better to do what I said before, put the iommu not
> the dev in the bond struct.
>
>

I was trying not to duplicate data in the bond struct and to use whatever
is available to be referenced from the dev pointer (e.g. iommu / ids /
private iommu dev data). If I'm reading the core iommu code correctly,
the device and iommu pointers should be valid between the _probe_device
and _release_device calls. I've moved synchronize_rcu out of the domain
attach path to _release_device, LMK if that would be ok for now.
I'll take another pass to rework the other patches to avoid storing
dev pointers at all.


> > +static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
> > +{
> > +     struct riscv_iommu_domain *domain;
> > +     struct riscv_iommu_device *iommu;
> > +
> > +     iommu = dev ? dev_to_iommu(dev) : NULL;
> > +     domain = kzalloc(sizeof(*domain), GFP_KERNEL);
> > +     if (!domain)
> > +             return ERR_PTR(-ENOMEM);
> > +
> > +     INIT_LIST_HEAD_RCU(&domain->bonds);
> > +     spin_lock_init(&domain->lock);
> > +     domain->numa_node = NUMA_NO_NODE;
> > +
> > +     /*
> > +      * Follow system address translation mode.
> > +      * RISC-V IOMMU ATP mode values match RISC-V CPU SATP mode values.
> > +      */
> > +     domain->pgd_mode = satp_mode >> SATP_MODE_SHIFT;
>
> This seems really strange, the iommu paging domains should be
> unrelated to what the CPU is doing. There is no connection between
> these two concepts.
>
> Just pick a size that the iommu supports.
>
> The number of radix levels is a tunable alot of iommus have that we
> haven't really exposed to anything else yet.
>

Makes sense. I've left an option to pick mode from MMU for cases where
dev/iommu is not known at allocation time (with iommu_domain_alloc()).
I'd guess it's reasonable to assume IOMMU supported page modes will
match MMU.

> > +     /*
> > +      * Note: RISC-V Privilege spec mandates that virtual addresses
> > +      * need to be sign-extended, so if (VA_BITS - 1) is set, all
> > +      * bits >= VA_BITS need to also be set or else we'll get a
> > +      * page fault. However the code that creates the mappings
> > +      * above us (e.g. iommu_dma_alloc_iova()) won't do that for us
> > +      * for now, so we'll end up with invalid virtual addresses
> > +      * to map. As a workaround until we get this sorted out
> > +      * limit the available virtual addresses to VA_BITS - 1.
> > +      */
> > +     domain->domain.geometry.aperture_start = 0;
> > +     domain->domain.geometry.aperture_end = DMA_BIT_MASK(VA_BITS - 1);
> > +     domain->domain.geometry.force_aperture = true;
>
> Yikes.. This is probably the best solution long term anyhow, unless
> you need to use the last page in VFIO for some reason.
>
> >  static int riscv_iommu_device_domain_type(struct device *dev)
> >  {
> > -     return IOMMU_DOMAIN_IDENTITY;
> > +     struct riscv_iommu_device *iommu = dev_to_iommu(dev);
> > +
> > +     if (iommu->ddt_mode == RISCV_IOMMU_DDTP_MODE_BARE)
> > +             return IOMMU_DOMAIN_IDENTITY;
> > +
>
> Is there even a point of binding an iommu driver if the HW can't
> support a DDT table? Just return -ENODEV from probe_device?
>
> Logically a iommu block that can't decode the RID has no association
> at all with a Linux struct device :)
>

Done. Good point ;)

Thanks for review,
- Tomasz


> Jason
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Jason Gunthorpe 1 year, 7 months ago
On Fri, May 03, 2024 at 10:44:14AM -0700, Tomasz Jeznach wrote:
> > > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > > +             if (bond->dev == dev) {
> > > +                     list_del_rcu(&bond->list);
> > > +                     found = bond;
> > > +             }
> > > +     }
> > > +     spin_unlock_irqrestore(&domain->lock, flags);
> > > +
> > > +     /* Release and wait for all read-rcu critical sections have completed. */
> > > +     kfree_rcu(found, rcu);
> > > +     synchronize_rcu();
> >
> > Please no, synchronize_rcu() on a path like this is not so
> > reasonable.. Also you don't need kfree_rcu() if you write it like this.
> >
> > This still looks better to do what I said before, put the iommu not
> > the dev in the bond struct.
> >
> >
> 
> I was trying not to duplicate data in bond struct and use whatever is
> available to be referenced from dev pointer (eg iommu / ids / private
> iommu dev data).

I'm not sure that is a valuable goal considering the RCU
complexity.. But I suppose it would be a bit of a hassle to replicate
the ids list into bond structures. Maybe something to do when you get
to ATS since you'll probably want to replicate the ATS RIDs. (see what
Intel did, which I think is pretty good)

> If I'm reading core iommu code correctly, device pointer and iommu
> pointers should be valid between _probe_device and _release_device
> calls. I've moved synchronize_rcu out of the domain attach path to
> _release_device, LMK if that would be ok for now.  I'll have a
> second another to rework other patches to avoid storing dev pointers
> at all.

Yes, that seems better.. I'm happier to see device hot-unplug be slow
than un attach

There is another issue with the RCU that I haven't wrapped my head
around..

Technically we can have concurrent map/unmap/invalidation along side
device attach/detach. Does the invalidation under RCU work correctly?

For detach I think yes:

   Inv CPU                                   Detach CPU

  write io_pte                               Update device descriptor
  rcu_read_lock
    list_for_each
      <make invalidation command>            <make description inv cmd>
      dma_wmb()                              dma_wmb()
      <doorbell>                             <cmd doorbell>
  rcu_read_unlock
                                             list_del_rcu()
                                             <wipe ASID>

In this case I think we never miss an invalidation, the list_del is
always after the HW has been fully fenced, so I don't think we can
have any issue. Maybe a spurious invalidation if the ASID gets
re-used, but who cares.

Attach is different..

   Inv CPU                                   Attach CPU

  write io_pte
  rcu_read_lock
    list_for_each // empty
                                             list_add_rcu()
                                             Update device descriptor
                                             <make description inv cmd>
                                             dma_wmb()
                                             <cmd doorbell>
  rcu_read_unlock

As above shows we can "miss" an invalidation. The issue is narrow, the
io_pte could still be sitting in write buffers in "Inv CPU" and not
yet globally visible. "Attach CPU" could get the device descriptor
installed in the IOMMU and the IOMMU could walk an io_pte that is in
the old state. Effectively this is because there is no release/acquire
barrier passing the io_pte store from the Inv CPU to the Attach CPU to the
IOMMU.

It seems like it should be solvable somehow:
 1) Inv CPU releases all the io ptes
 2) Attach CPU acquires the io ptes before updating the DDT
 3) Inv CPU acquires the RCU list in such a way that either attach
    CPU will acquire the io_pte or inv CPU will acquire the RCU list.
 4) Either invalidation works or we release the new iopte to the SMMU
    and don't need it.

But #3 is a really weird statement. smp_mb() on both sides may do the
job??

> > The number of radix levels is a tunable alot of iommus have that we
> > haven't really exposed to anything else yet.
> 
> Makes sense. I've left an option to pick mode from MMU for cases where
> dev/iommu is not known at allocation time (with iommu_domain_alloc()).
> I'd guess it's reasonable to assume IOMMU supported page modes will
> match MMU.

Reasonable, but for this case you'd be best to have a global static
that unifies the capability of all the iommu instances. Then you can
pick the right thing from the installed iommus, and arguably, that is
the right thing to do in all cases as we'd slightly prefer domains
that work everywhere in that edge case.

Jason
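
A rough sketch of such a global static; the variable and helper names are
illustrative assumptions, not existing driver code:

	/* Capabilities common to all probed IOMMU instances (illustrative). */
	static u64 riscv_iommu_common_caps = ~0ULL;

	/*
	 * Called once per instance during probe; probing is assumed to be
	 * serialized here for simplicity, otherwise this needs a lock.
	 */
	static void riscv_iommu_merge_caps(struct riscv_iommu_device *iommu)
	{
		riscv_iommu_common_caps &= iommu->caps;
	}

A device-less domain_alloc_paging() could then pick the widest page table
mode present in the merged capabilities, e.g. Sv57 over Sv48 over Sv39,
instead of following satp_mode.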
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Tomasz Jeznach 1 year, 7 months ago
On Fri, May 3, 2024 at 11:11 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, May 03, 2024 at 10:44:14AM -0700, Tomasz Jeznach wrote:
> > > > +     list_for_each_entry_rcu(bond, &domain->bonds, list) {
> > > > +             if (bond->dev == dev) {
> > > > +                     list_del_rcu(&bond->list);
> > > > +                     found = bond;
> > > > +             }
> > > > +     }
> > > > +     spin_unlock_irqrestore(&domain->lock, flags);
> > > > +
> > > > +     /* Release and wait for all read-rcu critical sections have completed. */
> > > > +     kfree_rcu(found, rcu);
> > > > +     synchronize_rcu();
> > >
> > > Please no, synchronize_rcu() on a path like this is not so
> > > reasonable.. Also you don't need kfree_rcu() if you write it like this.
> > >
> > > This still looks better to do what I said before, put the iommu not
> > > the dev in the bond struct.
> > >
> > >
> >
> > I was trying not to duplicate data in bond struct and use whatever is
> > available to be referenced from dev pointer (eg iommu / ids / private
> > iommu dev data).
>
> I'm not sure that is a valuable goal considering the RCU
> complexity.. But I suppose it would be a bit of a hassle to replicate
> the ids list into bond structurs. Maybe something to do when you get
> to ATS since you'll probably want to replicate the ATS RIDs. (see what
> Intel did, which I think is pretty good)
>
> > If I'm reading core iommu code correctly, device pointer and iommu
> > pointers should be valid between _probe_device and _release_device
> > calls. I've moved synchronize_rcu out of the domain attach path to
> > _release_device, LMK if that would be ok for now.  I'll have a
> > second another to rework other patches to avoid storing dev pointers
> > at all.
>
> Yes, that seems better.. I'm happier to see device hot-unplug be slow
> than un attach
>
> There is another issue with the RCU that I haven't wrapped my head
> around..
>
> Technically we can have concurrent map/unmap/invalidation along side
> device attach/detach. Does the invalidation under RCU work correctly?
>
> For detach I think yes:
>
>    Inv CPU                                   Detach CPU
>
>   write io_pte                               Update device descriptor
>   rcu_read_lock
>     list_for_each
>       <make invalidation command>            <make description inv cmd>
>       dma_wmb()                              dma_wmb()
>       <doorbell>                             <cmd doorbell>
>   rcu_read_unlock
>                                              list_del_rcu()
>                                              <wipe ASID>
>
> In this case I think we never miss an invalidation, the list_del is
> always after the HW has been fully fenced, so I don't think we can
> have any issue. Maybe a suprious invalidation if the ASID gets
> re-used, but who cares.
>
> Attach is different..
>
>    Inv CPU                                   Attach CPU
>
>   write io_pte
>   rcu_read_lock
>     list_for_each // empty
>                                              list_add_rcu()
>                                              Update device descriptor
>                                              <make description inv cmd>
>                                              dma_wmb()
>                                              <cmd doorbell>
>   rcu_read_unlock
>
> As above shows we can "miss" an invalidation. The issue is narrow, the
> io_pte could still be sitting in write buffers in "Inv CPU" and not
> yet globally visiable. "Attach CPU" could get the device descriptor
> installed in the IOMMU and the IOMMU could walk an io_pte that is in
> the old state. Effectively this is because there is no release/acquire
> barrier passing the io_pte store from the Inv CPU to the Attach CPU to the
> IOMMU.
>
> It seems like it should be solvable somehow:
>  1) Inv CPU releases all the io ptes
>  2) Attach CPU acquires the io ptes before updating the DDT
>  3) Inv CPU acquires the RCU list in such a way that either attach
>     CPU will acquire the io_pte or inv CPU will acquire the RCU list.
>  4) Either invalidation works or we release the new iopte to the SMMU
>     and don't need it.
>
> But #3 is a really weird statement. smb_mb() on both sides may do the
> job??
>

The actual attach sequence is slightly different.

 Inv CPU                            Attach CPU

 write io_pte
  rcu_read_lock
    list_for_each // empty
                                    list_add_rcu()
                                    IOTLB.INVAL(PSCID)
                                    <make description inv cmd>
                                    dma_wmb()
                                    <cmd doorbell>
 rcu_read_unlock

I've tried to cover this case with riscv_iommu_iotlb_inval() called
before the attached domain is visible to the device. This is something
to optimize to avoid invalidating the whole PSCID if the domain was
already attached to the same IOMMU, but I'd leave it for a separate
patch series.

> > > The number of radix levels is a tunable alot of iommus have that we
> > > haven't really exposed to anything else yet.
> >
> > Makes sense. I've left an option to pick mode from MMU for cases where
> > dev/iommu is not known at allocation time (with iommu_domain_alloc()).
> > I'd guess it's reasonable to assume IOMMU supported page modes will
> > match MMU.
>
> Reasonable, but for this case you'd be best to have a global static
> that unifies the capability of all the iommu instances. Then you can
> pick the right thing from the installed iommus, and arguably, that is
> the right thing to do in all cases as we'd slightly prefer domains
> that work everywhere in that edge case.
>

That is a viable option. If you're ok with it, I'll address this as a separate patch.
I've been trying to avoid adding more global variables, but it might
not be avoidable.

> Jason

Best,
- Tomasz
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Jason Gunthorpe 1 year, 7 months ago
On Fri, May 03, 2024 at 12:44:09PM -0700, Tomasz Jeznach wrote:
> > For detach I think yes:
> >
> >    Inv CPU                                   Detach CPU
> >
> >   write io_pte                               Update device descriptor
> >   rcu_read_lock
> >     list_for_each
> >       <make invalidation command>            <make description inv cmd>
> >       dma_wmb()                              dma_wmb()
> >       <doorbell>                             <cmd doorbell>
> >   rcu_read_unlock
> >                                              list_del_rcu()
> >                                              <wipe ASID>
> >
> > In this case I think we never miss an invalidation, the list_del is
> > always after the HW has been fully fenced, so I don't think we can
> > have any issue. Maybe a suprious invalidation if the ASID gets
> > re-used, but who cares.
> >
> > Attach is different..
> >
> >    Inv CPU                                   Attach CPU
> >
> >   write io_pte
> >   rcu_read_lock
> >     list_for_each // empty
> >                                              list_add_rcu()
> >                                              Update device descriptor
> >                                              <make description inv cmd>
> >                                              dma_wmb()
> >                                              <cmd doorbell>
> >   rcu_read_unlock
> >
> > As above shows we can "miss" an invalidation. The issue is narrow, the
> > io_pte could still be sitting in write buffers in "Inv CPU" and not
> > yet globally visiable. "Attach CPU" could get the device descriptor
> > installed in the IOMMU and the IOMMU could walk an io_pte that is in
> > the old state. Effectively this is because there is no release/acquire
> > barrier passing the io_pte store from the Inv CPU to the Attach CPU to the
> > IOMMU.
> >
> > It seems like it should be solvable somehow:
> >  1) Inv CPU releases all the io ptes
> >  2) Attach CPU acquires the io ptes before updating the DDT
> >  3) Inv CPU acquires the RCU list in such a way that either attach
> >     CPU will acquire the io_pte or inv CPU will acquire the RCU list.
> >  4) Either invalidation works or we release the new iopte to the SMMU
> >     and don't need it.
> >
> > But #3 is a really weird statement. smb_mb() on both sides may do the
> > job??
> >
> 
> Actual attach sequence is slightly different.
> 
>  Inv CPU                            Attach CPU
> 
>  write io_pte
>   rcu_read_lock
>     list_for_each // empty
>                                     list_add_rcu()
>                                     IOTLB.INVAL(PSCID)
>                                     <make description inv cmd>
>                                     dma_wmb()
>                                     <cmd doorbell>
>  rcu_read_unlock
> 
> I've tried to cover this case with riscv_iommu_iotlb_inval() called
> before the attached domain is visible to the device.

That invalidation shouldn't do anything. If this is the first attach
of a PSCID then the PSCID had better already be empty, it won't become
non-empty until the DDT entry is installed.

And if it is the second attach then the Inv CPU is already taking care
of things, no need to invalidate at all.

Regardless, there is still a theoretical race that the IOPTEs haven't
been made visible yet because there is still no synchronization with
the CPU writing them.

So, I don't think this solves any problem. I believe you need the
appropriate kind of CPU barrier here instead of an invalidation.

Jason
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Tomasz Jeznach 1 year, 7 months ago
On Sun, May 5, 2024 at 8:46 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Fri, May 03, 2024 at 12:44:09PM -0700, Tomasz Jeznach wrote:
> > > For detach I think yes:
> > >
> > >    Inv CPU                                   Detach CPU
> > >
> > >   write io_pte                               Update device descriptor
> > >   rcu_read_lock
> > >     list_for_each
> > >       <make invalidation command>            <make description inv cmd>
> > >       dma_wmb()                              dma_wmb()
> > >       <doorbell>                             <cmd doorbell>
> > >   rcu_read_unlock
> > >                                              list_del_rcu()
> > >                                              <wipe ASID>
> > >
> > > In this case I think we never miss an invalidation, the list_del is
> > > always after the HW has been fully fenced, so I don't think we can
> > > have any issue. Maybe a suprious invalidation if the ASID gets
> > > re-used, but who cares.
> > >
> > > Attach is different..
> > >
> > >    Inv CPU                                   Attach CPU
> > >
> > >   write io_pte
> > >   rcu_read_lock
> > >     list_for_each // empty
> > >                                              list_add_rcu()
> > >                                              Update device descriptor
> > >                                              <make description inv cmd>
> > >                                              dma_wmb()
> > >                                              <cmd doorbell>
> > >   rcu_read_unlock
> > >
> > > As above shows we can "miss" an invalidation. The issue is narrow, the
> > > io_pte could still be sitting in write buffers in "Inv CPU" and not
> > > yet globally visiable. "Attach CPU" could get the device descriptor
> > > installed in the IOMMU and the IOMMU could walk an io_pte that is in
> > > the old state. Effectively this is because there is no release/acquire
> > > barrier passing the io_pte store from the Inv CPU to the Attach CPU to the
> > > IOMMU.
> > >
> > > It seems like it should be solvable somehow:
> > >  1) Inv CPU releases all the io ptes
> > >  2) Attach CPU acquires the io ptes before updating the DDT
> > >  3) Inv CPU acquires the RCU list in such a way that either attach
> > >     CPU will acquire the io_pte or inv CPU will acquire the RCU list.
> > >  4) Either invalidation works or we release the new iopte to the SMMU
> > >     and don't need it.
> > >
> > > But #3 is a really weird statement. smb_mb() on both sides may do the
> > > job??
> > >
> >
> > Actual attach sequence is slightly different.
> >
> >  Inv CPU                            Attach CPU
> >
> >  write io_pte
> >   rcu_read_lock
> >     list_for_each // empty
> >                                     list_add_rcu()
> >                                     IOTLB.INVAL(PSCID)
> >                                     <make description inv cmd>
> >                                     dma_wmb()
> >                                     <cmd doorbell>
> >  rcu_read_unlock
> >
> > I've tried to cover this case with riscv_iommu_iotlb_inval() called
> > before the attached domain is visible to the device.
>
> That invalidation shouldn't do anything. If this is the first attach
> of a PSCID then the PSCID had better already be empty, it won't become
> non-empty until the DDT entry is installed.
>
> And if it is the second attach then the Inv CPU is already taking care
> of things, no need to invalidate at all.
>
> Regardless, there is still a theortical race that the IOPTEs haven't
> been made visible yet because there is still no synchronization with
> the CPU writing them.
>
> So, I don't think this solves any problem. I belive you need the
> appropriate kind of CPU barrier here instead of an invalidation.
>

Yes. There was a small, but still plausible, race with IOPTE visibility
to the IOMMU.
For v5 I'm adding two barriers to the inval/detach flow, which I believe
should cover it.

1) In riscv_iommu_iotlb_inval(), an unconditional dma_wmb() to make any
pending writes to the PTEs visible to the IOMMU device. This should cover
the case where the list_add_rcu() update is not yet visible to the
_iotlb_inval() sequence the first time the domain is attached to
the IOMMU.

  Inv CPU                                    Attach CPU
  write io_pte
  dma_wmb (1)
  rcu_read_lock
    list_for_each // empty                   list_add_rcu()
                                             smp_wmb (2)
                                             Update device descriptor
                                             <make description inv cmd>
                                             // PTEs are visible to the HW (*1)
                                             dma_wmb()
                                             <cmd doorbell>
  rcu_read_unlock

2) In riscv_iommu_bond_link(), a write memory barrier to ensure the list
update is visible before the IOMMU descriptor update. If stale data has
been fetched by the HW, the inval CPU will run the IOTLB invalidation
sequence. There is a possibility that the IOMMU will fetch correct PTEs
and still receive an unnecessary IOTLB inval, but I don't think anyone
would care.

  Inv CPU                                    Attach CPU
  write io_pte                               list_add_rcu()
                                             smp_wmb (2)
                                             Update device descriptor
                                             <make description inv cmd>
                                             // HW might fetch stale PTEs
                                             dma_wmb()
                                             <cmd doorbell>
  dma_wmb (1)
  rcu_read_lock
    list_for_each // non-empty (*2)
      <make iotlb inval cmd>
      dma_wmb()
      <cmd doorbell>
  rcu_read_unlock

3) I've also updated riscv_iommu_bond_unlink() to wipe the PSCID cache
on the last domain unlink from the IOMMU.
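
Against the v3 code in this patch, the two barriers would sit roughly as
sketched below (illustrative placement only; the actual v5 change may
differ):

	static void riscv_iommu_iotlb_inval(struct riscv_iommu_domain *domain,
					    unsigned long start, unsigned long end)
	{
		...
		/*
		 * (1) Make pending PTE writes visible to the IOMMU before
		 * the bonds list is walked and IOTINVAL commands are issued.
		 */
		dma_wmb();

		rcu_read_lock();
		...
	}

	static int riscv_iommu_bond_link(struct riscv_iommu_domain *domain,
					 struct device *dev)
	{
		...
		list_add_rcu(&bond->list, bonds);
		spin_unlock_irqrestore(&domain->lock, flags);

		/*
		 * (2) Make the bonds list update visible before the caller
		 * installs the device descriptor referencing this domain.
		 */
		smp_wmb();

		return 0;
	}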

Thank you for pointing this out. Let me know if that makes sense.

Best,
- Tomasz

> Jason
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Jason Gunthorpe 1 year, 7 months ago
On Mon, May 06, 2024 at 07:22:07PM -0700, Tomasz Jeznach wrote:
> On Sun, May 5, 2024 at 8:46 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> >
> > On Fri, May 03, 2024 at 12:44:09PM -0700, Tomasz Jeznach wrote:
> > > > For detach I think yes:
> > > >
> > > >    Inv CPU                                   Detach CPU
> > > >
> > > >   write io_pte                               Update device descriptor
> > > >   rcu_read_lock
> > > >     list_for_each
> > > >       <make invalidation command>            <make description inv cmd>
> > > >       dma_wmb()                              dma_wmb()
> > > >       <doorbell>                             <cmd doorbell>
> > > >   rcu_read_unlock
> > > >                                              list_del_rcu()
> > > >                                              <wipe ASID>
> > > >
> > > > In this case I think we never miss an invalidation, the list_del is
> > > > always after the HW has been fully fenced, so I don't think we can
> > > > have any issue. Maybe a suprious invalidation if the ASID gets
> > > > re-used, but who cares.
> > > >
> > > > Attach is different..
> > > >
> > > >    Inv CPU                                   Attach CPU
> > > >
> > > >   write io_pte
> > > >   rcu_read_lock
> > > >     list_for_each // empty
> > > >                                              list_add_rcu()
> > > >                                              Update device descriptor
> > > >                                              <make description inv cmd>
> > > >                                              dma_wmb()
> > > >                                              <cmd doorbell>
> > > >   rcu_read_unlock
> > > >
> > > > As above shows we can "miss" an invalidation. The issue is narrow, the
> > > > io_pte could still be sitting in write buffers in "Inv CPU" and not
> > > > yet globally visiable. "Attach CPU" could get the device descriptor
> > > > installed in the IOMMU and the IOMMU could walk an io_pte that is in
> > > > the old state. Effectively this is because there is no release/acquire
> > > > barrier passing the io_pte store from the Inv CPU to the Attach CPU to the
> > > > IOMMU.
> > > >
> > > > It seems like it should be solvable somehow:
> > > >  1) Inv CPU releases all the io ptes
> > > >  2) Attach CPU acquires the io ptes before updating the DDT
> > > >  3) Inv CPU acquires the RCU list in such a way that either attach
> > > >     CPU will acquire the io_pte or inv CPU will acquire the RCU list.
> > > >  4) Either invalidation works or we release the new iopte to the SMMU
> > > >     and don't need it.
> > > >
> > > > But #3 is a really weird statement. smp_mb() on both sides may do the
> > > > job??
> > > >
> > >
> > > Actual attach sequence is slightly different.
> > >
> > >  Inv CPU                            Attach CPU
> > >
> > >  write io_pte
> > >   rcu_read_lock
> > >     list_for_each // empty
> > >                                     list_add_rcu()
> > >                                     IOTLB.INVAL(PSCID)
> > >                                     <make description inv cmd>
> > >                                     dma_wmb()
> > >                                     <cmd doorbell>
> > >  rcu_read_unlock
> > >
> > > I've tried to cover this case with riscv_iommu_iotlb_inval() called
> > > before the attached domain is visible to the device.
> >
> > That invalidation shouldn't do anything. If this is the first attach
> > of a PSCID then the PSCID had better already be empty, it won't become
> > non-empty until the DDT entry is installed.
> >
> > And if it is the second attach then the Inv CPU is already taking care
> > of things, no need to invalidate at all.
> >
> > Regardless, there is still a theoretical race that the IOPTEs haven't
> > been made visible yet because there is still no synchronization with
> > the CPU writing them.
> >
> > So, I don't think this solves any problem. I believe you need the
> > appropriate kind of CPU barrier here instead of an invalidation.
> >
> 
> Yes. There was a small, but still plausible race w/ IOPTEs visibility
> to the IOMMU.
> For v5 I'm adding two barriers to the inval/detach flow, which I
> believe should cover it.
> 
> 1) In riscv_iommu_iotlb_inval() unconditional dma_wmb() to make any
> pending writes to PTEs visible to the IOMMU device. This should cover
> the case when list_add_rcu() update is not yet visible in the
> _iotlb_inval() sequence, for the first time the domain is attached to
> the IOMMU.
> 
>   Inv CPU                                    Attach CPU
>   write io_pte
>   dma_wmb (1)
>   rcu_read_lock
>     list_for_each // empty                   list_add_rcu()
>                                              smp_wmb (2)
>                                              Update device descriptor
>                                              <make description inv cmd>
>                                              // PTEs are visible to the HW (*1)
>                                              dma_wmb()
>                                              <cmd doorbell>
>   rcu_read_unlock
> 
> 2) In riscv_iommu_bond_link(), a write memory barrier to ensure the
> list update is visible before the IOMMU descriptor update. If stale
> data has been fetched by the HW, the inval CPU will run the IOTLB
> invalidation sequence. There is a possibility that the IOMMU will
> fetch correct PTEs and still receive an unnecessary IOTLB inval, but
> I don't think anyone would care.
> 
>   Inv CPU                                    Attach CPU
>   write io_pte                               list_add_rcu()
>                                              smp_wmb (2)
>                                              Update device descriptor
>                                              <make description inv cmd>
>                                              // HW might fetch stale PTEs
>                                              dma_wmb()
>                                              <cmd doorbell>
>   dma_wmb (1)
>   rcu_read_lock
>     list_for_each // non-empty (*2)
>       <make iotlb inval cmd>
>       dma_wmb()
>       <cmd doorbell>
>   rcu_read_unlock
> 
> 3) I've also updated riscv_iommu_bond_unlink() to wipe the PSCID cache
> on the last domain unlink from the IOMMU.
> 
> Thank you for pointing this out. Let me know if that makes sense.

I'm not an expert in barriers, but I think you need the more expensive
"mb" in both cases.

The inv side is both releasing the write and acquiring the list
read. IIRC READ_ONCE is not a full acquire?

The Attach side is both releasing the list_add_rcu() and acquiring the
iopte.

rcu is still a benefit, there is no cache line sharing and there is
only one full barrier instead of the two a spinlock would take.

And a big fat comment on both sides explaining this :)
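
Something like this is what I have in mind (hand-wavy, reusing the
names from your patch, driver details elided):

	/* invalidation side, in riscv_iommu_iotlb_inval() */
	/* ... io_pte stores already issued by the map/unmap caller ... */
	smp_mb();	/* order the io_pte stores before the bonds list read */
	rcu_read_lock();
	list_for_each_entry_rcu(bond, &domain->bonds, list)
		;	/* queue IOTINVAL.VMA for the IOMMU serving bond->dev */
	rcu_read_unlock();

	/* attach side, in riscv_iommu_bond_link() */
	list_add_rcu(&bond->list, bonds);
	smp_mb();	/* order the list insert before the descriptor update
			 * that makes the io_ptes reachable by the IOMMU */
	/* ... update the device descriptor, send the DDT inval ... */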

Jason
Re: [PATCH v3 7/7] iommu/riscv: Paging domain support
Posted by Tomasz Jeznach 1 year, 7 months ago
On Tue, May 7, 2024 at 9:51 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
>
> On Mon, May 06, 2024 at 07:22:07PM -0700, Tomasz Jeznach wrote:
> > On Sun, May 5, 2024 at 8:46 AM Jason Gunthorpe <jgg@ziepe.ca> wrote:
> > >
> > > On Fri, May 03, 2024 at 12:44:09PM -0700, Tomasz Jeznach wrote:
> > > > > For detach I think yes:
> > > > >
> > > > >    Inv CPU                                   Detach CPU
> > > > >
> > > > >   write io_pte                               Update device descriptor
> > > > >   rcu_read_lock
> > > > >     list_for_each
> > > > >       <make invalidation command>            <make description inv cmd>
> > > > >       dma_wmb()                              dma_wmb()
> > > > >       <doorbell>                             <cmd doorbell>
> > > > >   rcu_read_unlock
> > > > >                                              list_del_rcu()
> > > > >                                              <wipe ASID>
> > > > >
> > > > > In this case I think we never miss an invalidation, the list_del is
> > > > > always after the HW has been fully fenced, so I don't think we can
> > > > > have any issue. Maybe a spurious invalidation if the ASID gets
> > > > > re-used, but who cares.
> > > > >
> > > > > Attach is different..
> > > > >
> > > > >    Inv CPU                                   Attach CPU
> > > > >
> > > > >   write io_pte
> > > > >   rcu_read_lock
> > > > >     list_for_each // empty
> > > > >                                              list_add_rcu()
> > > > >                                              Update device descriptor
> > > > >                                              <make description inv cmd>
> > > > >                                              dma_wmb()
> > > > >                                              <cmd doorbell>
> > > > >   rcu_read_unlock
> > > > >
> > > > > As above shows we can "miss" an invalidation. The issue is narrow, the
> > > > > io_pte could still be sitting in write buffers in "Inv CPU" and not
> > > > > yet globally visible. "Attach CPU" could get the device descriptor
> > > > > installed in the IOMMU and the IOMMU could walk an io_pte that is in
> > > > > the old state. Effectively this is because there is no release/acquire
> > > > > barrier passing the io_pte store from the Inv CPU to the Attach CPU to the
> > > > > IOMMU.
> > > > >
> > > > > It seems like it should be solvable somehow:
> > > > >  1) Inv CPU releases all the io ptes
> > > > >  2) Attach CPU acquires the io ptes before updating the DDT
> > > > >  3) Inv CPU acquires the RCU list in such a way that either attach
> > > > >     CPU will acquire the io_pte or inv CPU will acquire the RCU list.
> > > > >  4) Either invalidation works or we release the new iopte to the SMMU
> > > > >     and don't need it.
> > > > >
> > > > > But #3 is a really weird statement. smp_mb() on both sides may do the
> > > > > job??
> > > > >
> > > >
> > > > Actual attach sequence is slightly different.
> > > >
> > > >  Inv CPU                            Attach CPU
> > > >
> > > >  write io_pte
> > > >   rcu_read_lock
> > > >     list_for_each // empty
> > > >                                     list_add_rcu()
> > > >                                     IOTLB.INVAL(PSCID)
> > > >                                     <make description inv cmd>
> > > >                                     dma_wmb()
> > > >                                     <cmd doorbell>
> > > >  rcu_read_unlock
> > > >
> > > > I've tried to cover this case with riscv_iommu_iotlb_inval() called
> > > > before the attached domain is visible to the device.
> > >
> > > That invalidation shouldn't do anything. If this is the first attach
> > > of a PSCID then the PSCID had better already be empty, it won't become
> > > non-empty until the DDT entry is installed.
> > >
> > > And if it is the second attach then the Inv CPU is already taking care
> > > of things, no need to invalidate at all.
> > >
> > > Regardless, there is still a theoretical race that the IOPTEs haven't
> > > been made visible yet because there is still no synchronization with
> > > the CPU writing them.
> > >
> > > So, I don't think this solves any problem. I believe you need the
> > > appropriate kind of CPU barrier here instead of an invalidation.
> > >
> >
> > Yes. There was a small, but still plausible race w/ IOPTEs visibility
> > to the IOMMU.
> > For v5 I'm adding two barriers to the inval/detach flow, which I
> > believe should cover it.
> >
> > 1) In riscv_iommu_iotlb_inval() unconditional dma_wmb() to make any
> > pending writes to PTEs visible to the IOMMU device. This should cover
> > the case when list_add_rcu() update is not yet visible in the
> > _iotlb_inval() sequence, for the first time the domain is attached to
> > the IOMMU.
> >
> >   Inv CPU                                    Attach CPU
> >   write io_pte
> >   dma_wmb (1)
> >   rcu_read_lock
> >     list_for_each // empty                   list_add_rcu()
> >                                              smp_wmb (2)
> >                                              Update device descriptor
> >                                              <make description inv cmd>
> >                                              // PTEs are visible to the HW (*1)
> >                                              dma_wmb()
> >                                              <cmd doorbell>
> >   rcu_read_unlock
> >
> > 2) In riscv_iommu_bond_link(), a write memory barrier to ensure the
> > list update is visible before the IOMMU descriptor update. If stale
> > data has been fetched by the HW, the inval CPU will run the IOTLB
> > invalidation sequence. There is a possibility that the IOMMU will
> > fetch correct PTEs and still receive an unnecessary IOTLB inval, but
> > I don't think anyone would care.
> >
> >   Inv CPU                                    Attach CPU
> >   write io_pte                               list_add_rcu()
> >                                              smp_wmb (2)
> >                                              Update device descriptor
> >                                              <make description inv cmd>
> >                                              // HW might fetch stale PTEs
> >                                              dma_wmb()
> >                                              <cmd doorbell>
> >   dma_wmb (1)
> >   rcu_read_lock
> >     list_for_each // non-empty (*2)
> >       <make iotlb inval cmd>
> >       dma_wmb()
> >       <cmd doorbell>
> >   rcu_read_unlock
> >
> > 3) I've also updated riscv_iommu_bond_unlink() to wipe the PSCID cache
> > on the last domain unlink from the IOMMU.
> >
> > Thank you for pointing this out. Let me know if that makes sense.
>
> I'm not an expert in barriers, but I think you need the more expensive
> "mb" in both cases.
>
> The inv side is both releasing the write and acquiring the list
> read. IIRC READ_ONCE is not a full acquire?
>
> The Attach side is both releasing the list_add_rcu() and acquiring the
> iopte.
>
> rcu is still a benefit, there is no cache line sharing and there is
> only one full barrier instead of the two a spinlock would take.
>
> And a big fat comment on both sides explaining this :)
>

I'm not an expert in barriers either, but I've checked offline with one ;)

And the conclusion is that we need FENCE W,W (or stronger) on the
attach side, and FENCE RW,RW in the invalidation sequence.  Hardware
access to PTE/DC is sequential, with implied FENCE R,R barriers.

As the 'attach' sequence is a rare event anyway, I'll go with "mb" on
both sides, as suggested.
Unless there are other opinions on that, I'll update the barriers to
match this conclusion and try to capture the bond / inval / attach
synchronization assumptions in a long comment.
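
A first cut of that comment, to show the direction (wording to be
polished for v5; the fence mnemonics are from memory):

/*
 * Synchronization of the bonds list vs. the device directory and the
 * I/O page tables:
 *
 * The IOMMU walks the io_ptes of a paging domain only once a device
 * descriptor pointing to that domain has been installed, and the
 * driver sends IOTINVAL.VMA only to IOMMUs that have a bond on
 * domain->bonds.  To close the window where the IOMMU could walk
 * io_ptes that the invalidating CPU has not yet made visible:
 *
 *  - attach (riscv_iommu_bond_link): full barrier (smp_mb(), i.e.
 *    FENCE RW,RW) between list_add_rcu() and the device descriptor
 *    update;
 *
 *  - invalidation (riscv_iommu_iotlb_inval): full barrier between the
 *    io_pte updates and the walk of domain->bonds;
 *
 *    The pairing guarantees that at least one side observes the
 *    other's update: either the invalidating CPU sees the new bond
 *    and sends IOTINVAL.VMA to that IOMMU, or the io_pte updates were
 *    visible before the descriptor update made the domain reachable
 *    by the IOMMU and no invalidation is needed.
 *
 *  - detach (riscv_iommu_bond_unlink): the device descriptor is
 *    invalidated before list_del_rcu(), and the PSCID cache is wiped
 *    when the last bond for a given IOMMU goes away, so a re-used
 *    PSCID starts out clean.
 */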

Jason, thanks again for pointing this out.


> Jason

Best,
- Tomasz