Now, each smmu_domain is built with an invs array that keeps all of its
IDs (asid/vmid) and the SIDs of its attached devices, following the
exact pattern of the existing invalidation functions.

Introduce a new arm_smmu_domain_inv_range() helper that iterates
smmu_domain->invs and converts the invalidation array into commands,
along with an arm_smmu_domain_inv() wrapper. An invalidation request
with no size specified means a full flush rather than a range-based
one.
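
For example (illustrative call sites only, not part of this patch), a
2MB range invalidation at 4K granularity versus a full-domain flush
would look like:

	/* Range invalidation: converted to batched TLBI/ATC commands */
	arm_smmu_domain_inv_range(smmu_domain, iova, SZ_2M, SZ_4K, false);

	/* No size specified: each array entry gets a full invalidation */
	arm_smmu_domain_inv(smmu_domain);
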
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 9 +
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 215 ++++++++++++++++++--
2 files changed, 211 insertions(+), 13 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 715179249eced..69271beb54527 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -1060,6 +1060,15 @@ void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain,
unsigned long iova, size_t size);
+void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
+ unsigned long iova, size_t size,
+ unsigned int granule, bool leaf);
+
+static inline void arm_smmu_domain_inv(struct arm_smmu_domain *smmu_domain)
+{
+ arm_smmu_domain_inv_range(smmu_domain, 0, 0, 0, false);
+}
+
void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu,
struct arm_smmu_cmdq *cmdq);
int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 95615525b0ab8..aa770275029e2 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -2530,23 +2530,19 @@ static void arm_smmu_tlb_inv_context(void *cookie)
arm_smmu_atc_inv_domain(smmu_domain, 0, 0);
}
-static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
- unsigned long iova, size_t size,
- size_t granule,
- struct arm_smmu_domain *smmu_domain)
+static void arm_smmu_cmdq_batch_add_range(struct arm_smmu_device *smmu,
+ struct arm_smmu_cmdq_batch *cmds,
+ struct arm_smmu_cmdq_ent *cmd,
+ unsigned long iova, size_t size,
+ size_t granule, size_t pgsize)
{
- struct arm_smmu_device *smmu = smmu_domain->smmu;
- unsigned long end = iova + size, num_pages = 0, tg = 0;
+ unsigned long end = iova + size, num_pages = 0, tg = pgsize;
size_t inv_range = granule;
- struct arm_smmu_cmdq_batch cmds;
if (!size)
return;
if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
- /* Get the leaf page size */
- tg = __ffs(smmu_domain->domain.pgsize_bitmap);
-
num_pages = size >> tg;
/* Convert page size of 12,14,16 (log2) to 1,2,3 */
@@ -2566,8 +2562,6 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
num_pages++;
}
- arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
-
while (iova < end) {
if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) {
/*
@@ -2595,9 +2589,26 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
}
cmd->tlbi.addr = iova;
- arm_smmu_cmdq_batch_add(smmu, &cmds, cmd);
+ arm_smmu_cmdq_batch_add(smmu, cmds, cmd);
iova += inv_range;
}
+}
+
+static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd,
+ unsigned long iova, size_t size,
+ size_t granule,
+ struct arm_smmu_domain *smmu_domain)
+{
+ struct arm_smmu_device *smmu = smmu_domain->smmu;
+ struct arm_smmu_cmdq_batch cmds;
+ size_t pgsize;
+
+ /* Get the leaf page size */
+ pgsize = __ffs(smmu_domain->domain.pgsize_bitmap);
+
+ arm_smmu_cmdq_batch_init(smmu, &cmds, cmd);
+ arm_smmu_cmdq_batch_add_range(smmu, &cmds, cmd, iova, size, granule,
+ pgsize);
arm_smmu_cmdq_batch_submit(smmu, &cmds);
}
@@ -2653,6 +2664,184 @@ void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid,
__arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain);
}
+static bool arm_smmu_inv_size_too_big(struct arm_smmu_device *smmu, size_t size,
+ size_t granule)
+{
+ size_t max_tlbi_ops;
+
+ /* 0 size means invalidate all */
+ if (!size || size == SIZE_MAX)
+ return true;
+
+ if (smmu->features & ARM_SMMU_FEAT_RANGE_INV)
+ return false;
+
+ /*
+ * Borrowed from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h,
+	 * this is used as a threshold to replace per-granule "size_opcode"
+	 * commands with a single "nsize_opcode" command, when the SMMU does
+	 * not implement the range invalidation feature and issuing that many
+	 * per-granule TLBIs could otherwise cause a soft lockup.
+ */
+ max_tlbi_ops = 1 << (ilog2(granule) - 3);
+ return size >= max_tlbi_ops * granule;
+}
+
+/* Used by non INV_TYPE_ATS* invalidations */
+static void arm_smmu_inv_to_cmdq_batch(struct arm_smmu_inv *inv,
+ struct arm_smmu_cmdq_batch *cmds,
+ struct arm_smmu_cmdq_ent *cmd,
+ unsigned long iova, size_t size,
+ unsigned int granule)
+{
+ if (arm_smmu_inv_size_too_big(inv->smmu, size, granule)) {
+ cmd->opcode = inv->nsize_opcode;
+ /* nsize_opcode always needs a sync, no batching */
+ arm_smmu_cmdq_issue_cmd_with_sync(inv->smmu, cmd);
+ return;
+ }
+
+ cmd->opcode = inv->size_opcode;
+ arm_smmu_cmdq_batch_add_range(inv->smmu, cmds, cmd, iova, size, granule,
+ inv->pgsize);
+}
+
+static bool arm_smmu_invs_end_batch(struct arm_smmu_invs *invs, size_t idx)
+{
+ struct arm_smmu_inv *cur = &invs->inv[idx];
+ struct arm_smmu_inv *next;
+
+ /* Last array entry ends */
+ if (idx + 1 == invs->num_invs)
+ return true;
+
+ next = &cur[1];
+ /* Changing smmu means changing command queue */
+ if (cur->smmu != next->smmu)
+ return true;
+ /* The batch for S2 TLBI must be done before nested S1 ASIDs */
+ if (next->type == INV_TYPE_S2_VMID_S1_CLEAR)
+ return true;
+ /* ATS must be after a sync of the S1/S2 invalidations */
+ if (cur->type != INV_TYPE_ATS && cur->type != INV_TYPE_ATS_FULL &&
+ (next->type == INV_TYPE_ATS || next->type == INV_TYPE_ATS_FULL))
+ return true;
+ return false;
+}
+
+void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
+ unsigned long iova, size_t size,
+ unsigned int granule, bool leaf)
+{
+ struct arm_smmu_cmdq_batch cmds = {};
+ struct arm_smmu_invs *invs;
+ bool retried = false;
+ size_t i;
+
+ /*
+	 * An invalidation request must follow some IOPTE change and then load
+	 * the invalidation array. In the meantime, a domain attachment mutates
+	 * the array and then stores an STE/CD asking SMMU HW to acquire those
+	 * changed IOPTEs. In other words, these two are interdependent and can
+ * race.
+ *
+ * In a race, the RCU design (with its underlying memory barriers) can
+	 * ensure the invalidation array always gets updated before it is loaded.
+ *
+ * smp_mb() is used here, paired with the smp_mb() following the array
+ * update in a concurrent attach, to ensure:
+ * - HW sees the new IOPTEs if it walks after STE installation
+ * - Invalidation thread sees the updated array with the new ASID.
+ *
+ * [CPU0] | [CPU1]
+ * |
+ * change IOPTEs and TLB flush: |
+ * arm_smmu_domain_inv_range() { | arm_smmu_install_new_domain_invs {
+ * ... | rcu_assign_pointer(new_invs);
+ * smp_mb(); // ensure IOPTEs | smp_mb(); // ensure new_invs
+ * ... | kfree_rcu(old_invs, rcu);
+ * // load invalidation array | }
+ * invs = rcu_dereference(); | arm_smmu_install_ste_for_dev {
+ * | STE = TTB0 // read new IOPTEs
+ */
+ smp_mb();
+
+ rcu_read_lock();
+again:
+ invs = rcu_dereference(smmu_domain->invs);
+
+ /* A concurrent attachment might have changed the array. Do a respin */
+ if (unlikely(!read_trylock(&invs->rwlock)))
+ goto again;
+ /* Only one retry. Otherwise, it would soft lockup on an empty array */
+ if (!retried && unlikely(!invs->num_invs)) {
+ read_unlock(&invs->rwlock);
+ retried = true;
+ goto again;
+ }
+
+ for (i = 0; i < invs->num_invs; i++) {
+ struct arm_smmu_inv *cur = &invs->inv[i];
+ struct arm_smmu_device *smmu = cur->smmu;
+ struct arm_smmu_cmdq_ent cmd = {
+ /*
+ * Pick size_opcode to run arm_smmu_get_cmdq(). This can
+ * be changed to nsize_opcode, which would result in the
+ * same CMDQ pointer.
+ */
+ .opcode = cur->size_opcode,
+ };
+
+ /* Do not throw any trash to the command queue */
+ if (refcount_read(&cur->users) == 0)
+ continue;
+
+ if (!cmds.num)
+ arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd);
+
+ switch (cur->type) {
+ case INV_TYPE_S1_ASID:
+ cmd.tlbi.asid = cur->id;
+ cmd.tlbi.leaf = leaf;
+ arm_smmu_inv_to_cmdq_batch(cur, &cmds, &cmd, iova, size,
+ granule);
+ break;
+ case INV_TYPE_S2_VMID:
+ cmd.tlbi.vmid = cur->id;
+ cmd.tlbi.leaf = leaf;
+ arm_smmu_inv_to_cmdq_batch(cur, &cmds, &cmd, iova, size,
+ granule);
+ break;
+ case INV_TYPE_S2_VMID_S1_CLEAR:
+ /* CMDQ_OP_TLBI_S12_VMALL already flushed S1 entries */
+ if (arm_smmu_inv_size_too_big(cur->smmu, size, granule))
+ continue;
+ /* Just a single CMDQ_OP_TLBI_NH_ALL, no batching */
+ cmd.tlbi.vmid = cur->id;
+ arm_smmu_cmdq_issue_cmd_with_sync(cur->smmu, &cmd);
+ continue;
+ case INV_TYPE_ATS:
+ arm_smmu_atc_inv_to_cmd(cur->ssid, iova, size, &cmd);
+ cmd.atc.sid = cur->id;
+ arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+ break;
+ case INV_TYPE_ATS_FULL:
+ arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
+ cmd.atc.sid = cur->id;
+ arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ continue;
+ }
+
+ if (cmds.num && arm_smmu_invs_end_batch(invs, i))
+ arm_smmu_cmdq_batch_submit(smmu, &cmds);
+ }
+ read_unlock(&invs->rwlock);
+ rcu_read_unlock();
+}
+
static void arm_smmu_tlb_inv_page_nosync(struct iommu_iotlb_gather *gather,
unsigned long iova, size_t granule,
void *cookie)
--
2.43.0
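
As a quick sketch of the arm_smmu_inv_size_too_big() threshold above
(assuming a 4K leaf granule and no ARM_SMMU_FEAT_RANGE_INV; the helper
below is illustrative only, not part of the patch):

	#include <linux/log2.h>

	static bool example_size_too_big(size_t size, size_t granule)
	{
		/* e.g. granule = SZ_4K: 1 << (12 - 3) = 512 ops */
		size_t max_tlbi_ops = 1UL << (ilog2(granule) - 3);

		/* 512 * 4K = 2MB or larger collapses to one nsize_opcode + sync */
		return size >= max_tlbi_ops * granule;
	}
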
On Wed, Aug 13, 2025 at 06:25:38PM -0700, Nicolin Chen wrote:
> +void arm_smmu_domain_inv_range(struct arm_smmu_domain *smmu_domain,
> +			       unsigned long iova, size_t size,
> +			       unsigned int granule, bool leaf)
> +{
> +	struct arm_smmu_cmdq_batch cmds = {};
> +	struct arm_smmu_invs *invs;
> +	bool retried = false;
> +	size_t i;
> +
> +	/*
> +	 * An invalidation request must follow some IOPTE change and then load
> +	 * the invalidation array. In the meantime, a domain attachment mutates
> +	 * the array and then stores an STE/CD asking SMMU HW to acquire those
> +	 * changed IOPTEs. In other words, these two are interdependent and can
> +	 * race.
> +	 *
> +	 * In a race, the RCU design (with its underlying memory barriers) can
> +	 * ensure the invalidation array always gets updated before it is loaded.
> +	 *
> +	 * smp_mb() is used here, paired with the smp_mb() following the array
> +	 * update in a concurrent attach, to ensure:
> +	 *  - HW sees the new IOPTEs if it walks after STE installation
> +	 *  - Invalidation thread sees the updated array with the new ASID.
> +	 *
> +	 *  [CPU0]                        |  [CPU1]
> +	 *                                |
> +	 * change IOPTEs and TLB flush:   |
> +	 *  arm_smmu_domain_inv_range() { |  arm_smmu_install_new_domain_invs {
> +	 *    ...                         |    rcu_assign_pointer(new_invs);
> +	 *    smp_mb(); // ensure IOPTEs  |    smp_mb(); // ensure new_invs
> +	 *    ...                         |    kfree_rcu(old_invs, rcu);
> +	 *    // load invalidation array  |  }
> +	 *    invs = rcu_dereference();   |  arm_smmu_install_ste_for_dev {
> +	 *                                |    STE = TTB0 // read new IOPTEs
> +	 */
> +	smp_mb();
> +
> +	rcu_read_lock();
> +again:
> +	invs = rcu_dereference(smmu_domain->invs);
> +
> +	/* A concurrent attachment might have changed the array. Do a respin */
> +	if (unlikely(!read_trylock(&invs->rwlock)))
> +		goto again;
> +	/* Only one retry. Otherwise, it would soft lockup on an empty array */
> +	if (!retried && unlikely(!invs->num_invs)) {
> +		read_unlock(&invs->rwlock);
> +		retried = true;
> +		goto again;
> +	}

This has missed the point, it was to not take the lock unless we have
ATS. Something like this:

	rcu_read_lock();

	while (true) {
		invs = rcu_dereference(smmu_domain->invs);

		/*
		 * Avoid locking unless ATS is being used. No ATS invalidate can
		 * be going on after a domain is detached.
		 */
		locked = false;
		if (invs->has_ats || READ_ONCE(invs->old)) {
			read_lock(&invs->rwlock);
			if (invs->old) {
				read_unlock(&invs->rwlock);
				continue;
			}
			locked = true;
		}
		break;
	}

	num_invs = READ_ONCE(invs->num_invs);
	for (i = 0; i != num_invs; i++) {

> +	case INV_TYPE_S2_VMID_S1_CLEAR:
> +		/* CMDQ_OP_TLBI_S12_VMALL already flushed S1 entries */
> +		if (arm_smmu_inv_size_too_big(cur->smmu, size, granule))
> +			continue;
> +		/* Just a single CMDQ_OP_TLBI_NH_ALL, no batching */
> +		cmd.tlbi.vmid = cur->id;
> +		arm_smmu_cmdq_issue_cmd_with_sync(cur->smmu, &cmd);

This should just stick it in the batch, the batch was already flushed:

	/* The batch for S2 TLBI must be done before nested S1 ASIDs */
	if (next->type == INV_TYPE_S2_VMID_S1_CLEAR)
		return true;

That needs a fixup too:

	/* The batch for S2 TLBI must be done before nested S1 ASIDs */
	if (cur->type != INV_TYPE_S2_VMID_S1_CLEAR &&
	    next->type == INV_TYPE_S2_VMID_S1_CLEAR)
		return true;

It makes sense to bundle all the NH_ALL into one batch if there is more
than one.

But this arm_smmu_invs_end_batch() no longer works right since the

	if (refcount_read(&cur->users) == 0)
		continue;

was added..

Jason
On Wed, Aug 27, 2025 at 03:49:23PM -0300, Jason Gunthorpe wrote:
> On Wed, Aug 13, 2025 at 06:25:38PM -0700, Nicolin Chen wrote:
> > +again:
> > +	invs = rcu_dereference(smmu_domain->invs);
> > +
> > +	/* A concurrent attachment might have changed the array. Do a respin */
> > +	if (unlikely(!read_trylock(&invs->rwlock)))
> > +		goto again;
> > +	/* Only one retry. Otherwise, it would soft lockup on an empty array */
> > +	if (!retried && unlikely(!invs->num_invs)) {
> > +		read_unlock(&invs->rwlock);
> > +		retried = true;
> > +		goto again;
> > +	}
>
> This has missed the point, it was to not take the lock unless we have
> ATS. Something like this:

I recall one of my earlier local versions put the ATS and blocked
conditions on the attach side. So, here it could be unconditional
because for most of the time this would be nearly a NOP, until an
attachment hits the FLR case.

Maybe those conditions got lost during the rework prior to rfcv1, so
here this ended up with missing a big thing...

> 	rcu_read_lock();
>
> 	while (true) {
> 		invs = rcu_dereference(smmu_domain->invs);
>
> 		/*
> 		 * Avoid locking unless ATS is being used. No ATS invalidate can
> 		 * be going on after a domain is detached.
> 		 */
> 		locked = false;
> 		if (invs->has_ats || READ_ONCE(invs->old)) {
> 			read_lock(&invs->rwlock);
> 			if (invs->old) {
> 				read_unlock(&invs->rwlock);
> 				continue;
> 			}
> 			locked = true;
> 		}
> 		break;
> 	}

I know that performance-wise, this piece will be a quick respin, as
the attach side releases the lock very fast. It still looks a bit
complicated. And practically, it would respin even if the attachment
removes a non-PCI device, right?

Applying the condition in the attachment on the other hand will be
accurate and simple "if (master->ats_enabled)"?

Thanks
Nicolin
On Sat, Sep 06, 2025 at 01:12:33AM -0700, Nicolin Chen wrote:
> I know that performance-wise, this piece will be a quick respin,
> as the attach side releases the lock very fast. It still looks
> a bit complicated. And practically, it would respin even if the
> attachment removes a non-PCI device, right?

If you are paying the cost of taking the lock then it should become
fully locked and consistent.

Jason
On Mon, Sep 08, 2025 at 12:39:11PM -0300, Jason Gunthorpe wrote:
> On Sat, Sep 06, 2025 at 01:12:33AM -0700, Nicolin Chen wrote:
>
> > I know that performance-wise, this piece will be a quick respin,
> > as the attach side releases the lock very fast. It still looks
> > a bit complicated. And practically, it would respin even if the
> > attachment removes a non-PCI device, right?
>
> If you are paying the cost of taking the lock then it should become
> fully locked and consistent.

Well, the point is that the reader doesn't know if an ATS entry
is getting removed, and it can only speculate by looking at the
full list.

So, would it be better to just always take the read lock, while
applying the ATS condition to the writer side:

[Reader]
+	/* A concurrent attachment has changed the array. Do a respin */
+	if (unlikely(!read_trylock(&invs->rwlock)))
+		goto again;
+	if (unlikely(!invs->old)) {
+		read_unlock(&invs->rwlock);
+		goto again;
+	}
...
+	read_unlock(&invs->rwlock);

[Writer]
+	bool ats_disabled = master->ats_enabled && !state->ats_enabled;
...
+	if (ats_disabled)
+		write_lock_irqsave(&old_invs->rwlock, flags);
+	WRITE_ONCE(old_invs->old, true);
+	if (ats_disabled)
+		write_unlock_irqrestore(&old_invs->rwlock, flags);

?

Thanks
Nicolin
On Mon, Sep 08, 2025 at 11:19:40AM -0700, Nicolin Chen wrote:
> On Mon, Sep 08, 2025 at 12:39:11PM -0300, Jason Gunthorpe wrote:
> > On Sat, Sep 06, 2025 at 01:12:33AM -0700, Nicolin Chen wrote:
> >
> > > I know that performance-wise, this piece will be a quick respin,
> > > as the attach side releases the lock very fast. It still looks
> > > a bit complicated. And practically, it would respin even if the
> > > attachment removes a non-PCI device, right?
> >
> > If you are paying the cost of taking the lock then it should become
> > fully locked and consistent.
>
> Well, the point is that the reader doesn't know if an ATS entry
> is getting removed, and it can only speculate by looking at the
> full list.

It doesn't care. It knows if it has to get a full lock or not, that's
it.

> So, would it be better to just always take the read lock, while
> applying the ATS condition to the writer side:

No, the whole optimization is to avoid read side locking on the fairly
common no-ATS case.

Always taking the lock destroys that.

Jason
On Mon, Sep 08, 2025 at 03:24:04PM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 11:19:40AM -0700, Nicolin Chen wrote:
> > So, would it be better to just always take the read lock, while
> > applying the ATS condition to the writer side:
>
> No, the whole optimization is to avoid read side locking on the fairly
> common no-ATS case.
>
> Always taking the lock destroys that.

OK. I skimmed through the lock_acquire(). It seems to be a bit heavier
than I expected. Perhaps this is the part we wanted to avoid.

Thanks
Nicolin