Update the invs array with the invalidations required by each domain type
during attachment operations.
Only an SVA domain or a paging domain will have an invs array:
a. SVA domain will add an INV_TYPE_S1_ASID per SMMU and an INV_TYPE_ATS
per SID
b. Non-nesting-parent paging domain with no ATS-enabled master will add
a single INV_TYPE_S1_ASID or INV_TYPE_S2_VMID per SMMU
c. Non-nesting-parent paging domain with ATS-enabled master(s) will do
(b) and add an INV_TYPE_ATS per SID
d. Nesting-parent paging domain will add an INV_TYPE_S2_VMID followed by
an INV_TYPE_S2_VMID_S1_CLEAR per vSMMU. For an ATS-enabled master, it
will add an INV_TYPE_ATS_FULL per SID
The per-domain invalidation is not needed until the domain is attached to
a master, i.e. until a translation request becomes possible. This clears
the way for allowing the domain to be attached to many SMMUs, and avoids
any pointless invalidation overhead during a teardown if there are no
STE/CDs referring to the domain. This also means that, when the last device
is detached, the old domain must flush its ASID or VMID, because any
iommu_unmap() call after it wouldn't initiate any invalidation given an
empty domain invs array.
Co-developed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Nicolin Chen <nicolinc@nvidia.com>
---
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 6 +
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 220 +++++++++++++++++++-
2 files changed, 225 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 0330444bef45f..715179249eced 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -1084,6 +1084,12 @@ struct arm_smmu_attach_state {
ioasid_t ssid;
/* Resulting state */
struct arm_smmu_vmaster *vmaster;
+ struct arm_smmu_invs **old_domain_invs;
+ struct arm_smmu_invs *old_domain_oinvs;
+ struct arm_smmu_invs *old_domain_ninvs;
+ struct arm_smmu_invs **new_domain_invs;
+ struct arm_smmu_invs *new_domain_oinvs;
+ struct arm_smmu_invs *new_domain_ninvs;
bool ats_enabled;
};
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index fb5429d8ebb29..95615525b0ab8 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3082,6 +3082,76 @@ static void arm_smmu_disable_iopf(struct arm_smmu_master *master,
iopf_queue_remove_device(master->smmu->evtq.iopf, master->dev);
}
+typedef struct arm_smmu_invs *(*invs_fn)(struct arm_smmu_invs *old_invs,
+ struct arm_smmu_invs *invs);
+
+/*
+ * Build the invalidation entries that @smmu_domain needs for @master in the
+ * master's scratch array (master->invs), then pass that array to @fn together
+ * with @old_invs to produce the updated domain array.
+ *
+ * Scratch array layout:
+ *   - one INV_TYPE_S1_ASID (S1/SVA stage) or INV_TYPE_S2_VMID (S2 stage)
+ *   - one INV_TYPE_S2_VMID_S1_CLEAR, only for a nesting parent
+ *   - one INV_TYPE_ATS (INV_TYPE_ATS_FULL for a nesting parent) per stream of
+ *     the master, only when @ats is true
+ *
+ * The scratch array belongs to the master, so each call overwrites the entries
+ * written by the previous call.
+ *
+ * NOTE(review): assumes master->invs->inv[] has room for 2 + num_streams
+ * entries; its allocation is not visible here -- confirm.
+ *
+ * Return: the result of @fn (callers test it with IS_ERR()), or
+ * ERR_PTR(-EINVAL) for an unexpected domain stage. The live @old_invs is never
+ * returned, since callers own, install and free whatever this returns.
+ */
+static struct arm_smmu_invs *arm_smmu_build_invs(
+	struct arm_smmu_invs *old_invs, struct arm_smmu_domain *smmu_domain,
+	struct arm_smmu_master *master, bool ats, ioasid_t ssid, invs_fn fn)
+{
+	const bool e2h = master->smmu->features & ARM_SMMU_FEAT_E2H;
+	const bool nesting = smmu_domain->nest_parent;
+	/* @cur always points at the next free slot of the scratch array */
+	struct arm_smmu_inv *cur = master->invs->inv;
+	size_t i;
+
+	switch (smmu_domain->stage) {
+	case ARM_SMMU_DOMAIN_SVA:
+	case ARM_SMMU_DOMAIN_S1:
+		cur->smmu = master->smmu;
+		cur->type = INV_TYPE_S1_ASID;
+		cur->id = smmu_domain->cd.asid;
+		cur->size_opcode = e2h ? CMDQ_OP_TLBI_EL2_VA :
+					 CMDQ_OP_TLBI_NH_VA;
+		cur->nsize_opcode = e2h ? CMDQ_OP_TLBI_EL2_ASID :
+					  CMDQ_OP_TLBI_NH_ASID;
+		break;
+	case ARM_SMMU_DOMAIN_S2:
+		cur->smmu = master->smmu;
+		cur->type = INV_TYPE_S2_VMID;
+		cur->id = smmu_domain->s2_cfg.vmid;
+		cur->size_opcode = CMDQ_OP_TLBI_S2_IPA;
+		cur->nsize_opcode = CMDQ_OP_TLBI_S12_VMALL;
+		break;
+	default:
+		WARN_ON(true);
+		/*
+		 * Do not hand back @old_invs: the caller would install it as
+		 * the "new" array and then free it as the "old" one.
+		 */
+		return ERR_PTR(-EINVAL);
+	}
+
+	/* Range-based invalidation requires the leaf pgsize for calculation */
+	if (master->smmu->features & ARM_SMMU_FEAT_RANGE_INV)
+		cur->pgsize = __ffs(smmu_domain->domain.pgsize_bitmap);
+	cur++;
+
+	/* All the nested S1 ASIDs have to be flushed when S2 parent changes */
+	if (nesting) {
+		cur->smmu = master->smmu;
+		cur->type = INV_TYPE_S2_VMID_S1_CLEAR;
+		cur->id = smmu_domain->s2_cfg.vmid;
+		cur->size_opcode = CMDQ_OP_TLBI_NH_ALL;
+		cur->nsize_opcode = CMDQ_OP_TLBI_NH_ALL;
+		cur++;
+	}
+
+	if (ats) {
+		/* One ATC invalidation entry per stream: advance @cur each time */
+		for (i = 0; i < master->num_streams; i++, cur++) {
+			cur->smmu = master->smmu;
+			/*
+			 * If an S2 used as a nesting parent is changed we have
+			 * no option but to completely flush the ATC.
+			 */
+			cur->type = nesting ? INV_TYPE_ATS_FULL : INV_TYPE_ATS;
+			cur->id = master->streams[i].id;
+			cur->ssid = ssid;
+			cur->size_opcode = CMDQ_OP_ATC_INV;
+			cur->nsize_opcode = CMDQ_OP_ATC_INV;
+		}
+	}
+
+	/* The number of entries is exactly how far @cur has advanced */
+	master->invs->num_invs = cur - master->invs->inv;
+	return fn(old_invs, master->invs);
+}
+
static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
struct iommu_domain *domain,
ioasid_t ssid)
@@ -3111,6 +3181,144 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master,
kfree(master_domain);
}
+static int arm_smmu_attach_prepare_invs(struct arm_smmu_attach_state *state,
+ struct arm_smmu_domain *new_smmu_domain)
+{
+ struct arm_smmu_domain *old_smmu_domain =
+ to_smmu_domain_devices(state->old_domain);
+ struct arm_smmu_master *master = state->master;
+ bool blocking = false;
+
+ /*
+ * Pre-build the two updated invs arrays needed by an attachment: one
+ * with the master's invalidations added to the new domain (via
+ * arm_smmu_invs_add) and one with them deleted from the old domain
+ * (via arm_smmu_invs_del). The results are only recorded in @state
+ * here; the domains' invs pointers are switched later by
+ * arm_smmu_install_new_domain_invs() and
+ * arm_smmu_install_old_domain_invs().
+ *
+ * The current arrays are read with rcu_dereference_protected() against
+ * arm_smmu_asid_lock, so the caller is expected to hold that lock.
+ *
+ * Returns 0 on success, or the PTR_ERR() of a failed array build.
+ *
+ * A re-attach case doesn't need to update invs array
+ */
+ if (new_smmu_domain == old_smmu_domain)
+ return 0;
+
+ if (new_smmu_domain) {
+ state->new_domain_oinvs = rcu_dereference_protected(
+ new_smmu_domain->invs,
+ lockdep_is_held(&arm_smmu_asid_lock));
+ state->new_domain_ninvs = arm_smmu_build_invs(
+ state->new_domain_oinvs, new_smmu_domain, master,
+ state->ats_enabled, state->ssid, arm_smmu_invs_add);
+ if (IS_ERR(state->new_domain_ninvs))
+ return PTR_ERR(state->new_domain_ninvs);
+ state->new_domain_invs = &new_smmu_domain->invs;
+ blocking = new_smmu_domain->domain.type == IOMMU_DOMAIN_BLOCKED;
+ }
+
+ if (old_smmu_domain) {
+ state->old_domain_oinvs = rcu_dereference_protected(
+ old_smmu_domain->invs,
+ lockdep_is_held(&arm_smmu_asid_lock));
+ state->old_domain_ninvs = arm_smmu_build_invs(
+ state->old_domain_oinvs, old_smmu_domain, master,
+ master->ats_enabled, state->ssid, arm_smmu_invs_del);
+ if (IS_ERR(state->old_domain_ninvs)) {
+ /*
+ * An attachment to the blocked_domain must not fail, so
+ * drop the error and leave old_domain_ninvs NULL. That
+ * makes arm_smmu_install_old_domain_invs() edit the old
+ * array in place with arm_smmu_invs_dec() instead.
+ * Otherwise free the array already built for the new
+ * domain and report the error.
+ */
+ if (blocking) {
+ state->old_domain_ninvs = NULL;
+ } else {
+ kfree(state->new_domain_ninvs);
+ return PTR_ERR(state->old_domain_ninvs);
+ }
+ }
+ state->old_domain_invs = &old_smmu_domain->invs;
+ /*
+ * master->invs is retaining the del_invs for the old domain:
+ * the old domain is built last, so the scratch array still
+ * holds its entries when arm_smmu_install_old_domain_invs()
+ * reads master->invs.
+ */
+ }
+
+ return 0;
+}
+
+/*
+ * Publish the pre-built array state->new_domain_ninvs as the new domain's invs
+ * array with rcu_assign_pointer(), then release the replaced array
+ * (state->new_domain_oinvs) through kfree_rcu(). Does nothing when
+ * state->new_domain_invs was never set by arm_smmu_attach_prepare_invs().
+ *
+ * Must be installed before arm_smmu_install_ste_for_dev()
+ */
+static void
+arm_smmu_install_new_domain_invs(struct arm_smmu_attach_state *state)
+{
+ if (!state->new_domain_invs)
+ return;
+
+ rcu_assign_pointer(*state->new_domain_invs, state->new_domain_ninvs);
+ /*
+ * Committed to updating the STE, using the new invalidation array, and
+ * acquiring any racing IOPTE updates.
+ */
+ smp_mb();
+ kfree_rcu(state->new_domain_oinvs, rcu);
+}
+
+/*
+ * Drop the master's invalidations from the old domain's invs array: either
+ * publish the pre-built state->old_domain_ninvs, or, when that is NULL, edit
+ * the current array in place with arm_smmu_invs_dec(). If no invalidation is
+ * left afterwards, issue one TLBI using the nsize_opcode of master->invs->inv[0]
+ * and wait for its sync. Does nothing when state->old_domain_invs was never
+ * set by arm_smmu_attach_prepare_invs().
+ *
+ * Should be installed after arm_smmu_install_ste_for_dev()
+ */
+static void
+arm_smmu_install_old_domain_invs(struct arm_smmu_attach_state *state)
+{
+ struct arm_smmu_invs *old_domain_oinvs = state->old_domain_oinvs;
+ struct arm_smmu_invs *old_domain_ninvs = state->old_domain_ninvs;
+ struct arm_smmu_master *master = state->master;
+ unsigned long flags;
+ size_t num_invs;
+
+ if (!state->old_domain_invs)
+ return;
+
+ /* Activate the no-fail protocol upon an allocation failure */
+ if (!old_domain_ninvs) {
+ /*
+ * Notes:
+ * - The array will be edited in place while holding its rwlock
+ * which has a tradeoff that any concurrent invalidation will
+ * fail at read_trylock() until arm_smmu_invs_dec() returns.
+ * - arm_smmu_invs_dec() doesn't update the array's num_invs as
+ * it only decreases the users counters. So, get num_invs from
+ * the returned value.
+ * - The master->invs retains the del_invs for the old domain.
+ */
+ num_invs = arm_smmu_invs_dec(old_domain_oinvs, master->invs);
+ } else {
+ rcu_assign_pointer(*state->old_domain_invs, old_domain_ninvs);
+ /*
+ * Fake an empty old array that a concurrent invalidation thread
+ * races at. It either lets the reader quickly respin for a new
+ * array with fewer num_invs (avoiding deleted invalidations) or
+ * blocks the writer till the reader flushes the array (avoiding
+ * ATC invalidation timeouts for ATS invalidations being sent to
+ * a resetting PCI device).
+ */
+ write_lock_irqsave(&old_domain_oinvs->rwlock, flags);
+ old_domain_oinvs->num_invs = 0;
+ write_unlock_irqrestore(&old_domain_oinvs->rwlock, flags);
+
+ kfree_rcu(old_domain_oinvs, rcu);
+ num_invs = state->old_domain_ninvs->num_invs;
+ }
+
+ /*
+ * The domain invs array was filled when the first device attaches to it
+ * and emptied when the last device detaches. So, the invs array doesn't
+ * synchronize with iommu_unmap() calls, which might come after the last
+ * detach and end up with a NOP. This would result in missing a critical
+ * TLB maintenance. Thus, when the last device is detached (indicated by
+ * an empty invs array), flush all TLBs using the removed ASID or VMID.
+ */
+ if (!num_invs) {
+ struct arm_smmu_inv *inv = &master->invs->inv[0];
+ struct arm_smmu_cmdq_ent cmd = {
+ .opcode = inv->nsize_opcode,
+ };
+
+ switch (inv->type) {
+ case INV_TYPE_S1_ASID:
+ cmd.tlbi.asid = inv->id;
+ arm_smmu_cmdq_issue_cmd_with_sync(inv->smmu, &cmd);
+ break;
+ case INV_TYPE_S2_VMID:
+ cmd.tlbi.vmid = inv->id;
+ arm_smmu_cmdq_issue_cmd_with_sync(inv->smmu, &cmd);
+ break;
+ default:
+ WARN_ON(true);
+ break;
+ }
+ }
+}
+
/*
* Start the sequence to attach a domain to a master. The sequence contains three
* steps:
@@ -3168,12 +3376,16 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
arm_smmu_ats_supported(master);
}
+ ret = arm_smmu_attach_prepare_invs(state, smmu_domain);
+ if (ret)
+ return ret;
+
if (smmu_domain) {
if (new_domain->type == IOMMU_DOMAIN_NESTED) {
ret = arm_smmu_attach_prepare_vmaster(
state, to_smmu_nested_domain(new_domain));
if (ret)
- return ret;
+ goto err_unprepare_invs;
}
master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL);
@@ -3221,6 +3433,8 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
atomic_inc(&smmu_domain->nr_ats_masters);
list_add(&master_domain->devices_elm, &smmu_domain->devices);
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
+
+ arm_smmu_install_new_domain_invs(state);
}
if (!state->ats_enabled && master->ats_enabled) {
@@ -3240,6 +3454,9 @@ int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state,
kfree(master_domain);
err_free_vmaster:
kfree(state->vmaster);
+err_unprepare_invs:
+ kfree(state->old_domain_ninvs);
+ kfree(state->new_domain_ninvs);
return ret;
}
@@ -3271,6 +3488,7 @@ void arm_smmu_attach_commit(struct arm_smmu_attach_state *state)
}
arm_smmu_remove_master_domain(master, state->old_domain, state->ssid);
+ arm_smmu_install_old_domain_invs(state);
master->ats_enabled = state->ats_enabled;
}
--
2.43.0
On Wed, Aug 13, 2025 at 06:25:37PM -0700, Nicolin Chen wrote: > +typedef struct arm_smmu_invs *(*invs_fn)(struct arm_smmu_invs *old_invs, > + struct arm_smmu_invs *invs); no reason to pass in fn, this always just calls it as the last thing so the caller can do it.. > +static struct arm_smmu_invs *arm_smmu_build_invs( > + struct arm_smmu_invs *old_invs, struct arm_smmu_domain *smmu_domain, > + struct arm_smmu_master *master, bool ats, ioasid_t ssid, invs_fn fn) > +{ > + const bool e2h = master->smmu->features & ARM_SMMU_FEAT_E2H; > + const bool nesting = smmu_domain->nest_parent; > + struct arm_smmu_inv *cur = master->invs->inv; > + size_t num_invs = 1; > + size_t i; > + > + switch (smmu_domain->stage) { > + case ARM_SMMU_DOMAIN_SVA: > + case ARM_SMMU_DOMAIN_S1: > + cur->smmu = master->smmu; > + cur->type = INV_TYPE_S1_ASID; > + cur->id = smmu_domain->cd.asid; > + cur->size_opcode = e2h ? CMDQ_OP_TLBI_EL2_VA : > + CMDQ_OP_TLBI_NH_VA; > + cur->nsize_opcode = e2h ? CMDQ_OP_TLBI_EL2_ASID : > + CMDQ_OP_TLBI_NH_ASID; > + break; > + case ARM_SMMU_DOMAIN_S2: > + cur->smmu = master->smmu; > + cur->type = INV_TYPE_S2_VMID; > + cur->id = smmu_domain->s2_cfg.vmid; > + cur->size_opcode = CMDQ_OP_TLBI_S2_IPA; > + cur->nsize_opcode = CMDQ_OP_TLBI_S12_VMALL; > + break; > + default: > + WARN_ON(true); > + return old_invs; Return ERR_PTR, it makes the error flows possibly wrong or at least over complex to return something that shouldn't be freed. > + } > + > + /* Range-based invalidation requires the leaf pgsize for calculation */ > + if (master->smmu->features & ARM_SMMU_FEAT_RANGE_INV) > + cur->pgsize = __ffs(smmu_domain->domain.pgsize_bitmap); > + > + /* All the nested S1 ASIDs have to be flushed when S2 parent changes */ > + if (nesting) { > + cur = &master->invs->inv[num_invs++]; Don't do both 'cur as an iterator' and 'num_invs as the location'. Delete num_invs entirely and just use cur. 
> + cur->smmu = master->smmu; > + cur->type = INV_TYPE_S2_VMID_S1_CLEAR; > + cur->id = smmu_domain->s2_cfg.vmid; > + cur->size_opcode = CMDQ_OP_TLBI_NH_ALL; > + cur->nsize_opcode = CMDQ_OP_TLBI_NH_ALL; > + } > + > + if (ats) { > + for (i = 0, cur++; i < master->num_streams; i++) { > + cur->smmu = master->smmu; > + /* > + * If an S2 used as a nesting parent is changed we have > + * no option but to completely flush the ATC. > + */ > + cur->type = nesting ? INV_TYPE_ATS_FULL : INV_TYPE_ATS; > + cur->id = master->streams[i].id; > + cur->ssid = ssid; > + cur->size_opcode = CMDQ_OP_ATC_INV; > + cur->nsize_opcode = CMDQ_OP_ATC_INV; > + } > + num_invs += master->num_streams; > + } > + > + master->invs->num_invs = num_invs; Like this: master->invs->num_invs = cur - master->invs->inv; > +static int arm_smmu_attach_prepare_invs(struct arm_smmu_attach_state *state, > + struct arm_smmu_domain *new_smmu_domain) > +{ How about a comment: /* * During attachment the invalidation lists on the two domains are sequenced: * 1. old domain is invalidating master * 2. new and old domain are invalidating master * 3. new domain is invalidating master * * This uses two updated invalidation lists, one with master added to new domain * and one with master removed from old domain. Prepare these lists in advance * of changing anything. arm_smmu_asid_lock ensures that the invalidation list * in the domains doesn't change while we are sequencing to update it. */ > + struct arm_smmu_domain *old_smmu_domain = > + to_smmu_domain_devices(state->old_domain); > + struct arm_smmu_master *master = state->master; > + bool blocking = false; > + > + /* A re-attach case doesn't need to update invs array */ > + if (new_smmu_domain == old_smmu_domain) > + return 0; > + > + if (new_smmu_domain) { This if wants a comment, it is tricky: /* * At this point a NULL domain indicates the domain doesn't use the * IOTLB, see to_smmu_domain_devices(). 
*/ > + state->new_domain_oinvs = rcu_dereference_protected( > + new_smmu_domain->invs, > + lockdep_is_held(&arm_smmu_asid_lock)); > + state->new_domain_ninvs = arm_smmu_build_invs( > + state->new_domain_oinvs, new_smmu_domain, master, > + state->ats_enabled, state->ssid, arm_smmu_invs_add); > + if (IS_ERR(state->new_domain_ninvs)) > + return PTR_ERR(state->new_domain_ninvs); > + state->new_domain_invs = &new_smmu_domain->invs; > + blocking = new_smmu_domain->domain.type == IOMMU_DOMAIN_BLOCKED; > + } > + > + if (old_smmu_domain) { > + state->old_domain_oinvs = rcu_dereference_protected( > + old_smmu_domain->invs, > + lockdep_is_held(&arm_smmu_asid_lock)); > + state->old_domain_ninvs = arm_smmu_build_invs( > + state->old_domain_oinvs, old_smmu_domain, master, > + master->ats_enabled, state->ssid, arm_smmu_invs_del); > + if (IS_ERR(state->old_domain_ninvs)) { Then here, as per the last email, just get rid of invs_del and use the scratch list master->invs for the next step. So all this goes away: Jason
On Wed, Aug 27, 2025 at 03:21:23PM -0300, Jason Gunthorpe wrote: > On Wed, Aug 13, 2025 at 06:25:37PM -0700, Nicolin Chen wrote: > > +typedef struct arm_smmu_invs *(*invs_fn)(struct arm_smmu_invs *old_invs, > > + struct arm_smmu_invs *invs); > > no reason to pass in fn, this always just calls it as the last thing > so the caller can do it.. The only concern with letting callers invoke it separately is that master->build_invs is shared memory. So, embedding the call inside arm_smmu_build_invs() allows the array to be used before it is overwritten. Having said that, I think we should be fine, as I noted this down in the kdocs. Thanks Nicolin > > +static struct arm_smmu_invs *arm_smmu_build_invs( > > + struct arm_smmu_invs *old_invs, struct arm_smmu_domain *smmu_domain, > > + struct arm_smmu_master *master, bool ats, ioasid_t ssid, invs_fn fn)
On Wed, Aug 27, 2025 at 03:21:23PM -0300, Jason Gunthorpe wrote: > On Wed, Aug 13, 2025 at 06:25:37PM -0700, Nicolin Chen wrote: > > +static int arm_smmu_attach_prepare_invs(struct arm_smmu_attach_state *state, > > + struct arm_smmu_domain *new_smmu_domain) > > +{ > > How about a comment: > > /* > * During attachment the invalidation lists on the two domains are sequenced: > * 1. old domain is invalidating master > * 2. new and old domain are invalidating master > * 3. new domain is invalidating master > * > * This uses two updated invalidation lists, one with master added to new domain > * and one with master removed from old domain. Prepare these lists in advance > * of changing anything. arm_smmu_asid_lock ensures that the invalidation list > * in the domains doesn't change while we are sequencing to update it. > */ Having addressed the other places following your remarks, I had a little trouble understanding that 1-2-3. But I think that can be elaborated with: +/* + * During attachment, the updates of the two domain->invs arrays are sequenced: + * 1. new domain updates its invs array, merging master->build_invs + * 2. new domain starts to include the master during its invalidation + * 3. master updates its STE switching from the old domain to the new domain + * 4. old domain still includes the master during its invalidation + * 5. old domain updates its invs array, unreferencing master->build_invs + * + * For 1 and 5, prepare the two updated arrays in advance, handling any changes + * that can possibly fail. So the actual update of either 1 or 5 won't fail. + * arm_smmu_asid_lock ensures that the old invs in the domains are intact while + * we are sequencing to update them. + */ Thanks Nicolin
© 2016 - 2025 Red Hat, Inc.