The Intel VT-d PASID table entry is 512 bits (64 bytes). Because the
hardware may fetch this entry in multiple 128-bit chunks, updating the
entire entry while it is active (P=1) risks a "torn" read where the
hardware observes an inconsistent state.
However, certain updates (e.g., changing page table pointers while
keeping the translation type and domain ID the same) can be performed
hitlessly. This is possible if the update is limited to a single
128-bit chunk while the other chunks remain stable.
Introduce a hitless replacement mechanism for PASID entries:
- Update 'struct pasid_entry' with a union to support 128-bit
access via the newly added val128[4] array.
- Add pasid_support_hitless_replace() to determine if a transition
between an old and new entry is safe to perform atomically.
- For First-level/Nested translations: The first 128 bits (chunk 0)
must remain identical; chunk 1 is updated atomically.
- For Second-level/Pass-through: The second 128 bits (chunk 1)
must remain identical; chunk 0 is updated atomically.
- If hitless replacement is supported, use intel_iommu_atomic128_set()
to commit the change in a single 16-byte burst.
- If the changes are too extensive to be hitless, fall back to the
safe "tear down and re-setup" flow (clear present -> flush -> setup).
Fixes: 7543ee63e811 ("iommu/vt-d: Add pasid replace helpers")
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
drivers/iommu/intel/pasid.h | 26 ++++++++++++++++-
drivers/iommu/intel/pasid.c | 57 ++++++++++++++++++++++++++++++++++---
2 files changed, 78 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
index 35de1d77355f..b569e2828a8b 100644
--- a/drivers/iommu/intel/pasid.h
+++ b/drivers/iommu/intel/pasid.h
@@ -37,7 +37,10 @@ struct pasid_dir_entry {
};
struct pasid_entry {
- u64 val[8];
+ union {
+ u64 val[8];
+ u128 val128[4];
+ };
};
#define PASID_ENTRY_PGTT_FL_ONLY (1)
@@ -297,6 +300,27 @@ static inline void pasid_set_eafe(struct pasid_entry *pe)
pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
}
+static inline bool pasid_support_hitless_replace(struct pasid_entry *pte,
+ struct pasid_entry *new, int type)
+{
+ switch (type) {
+ case PASID_ENTRY_PGTT_FL_ONLY:
+ case PASID_ENTRY_PGTT_NESTED:
+ /* The first 128 bits remain the same. */
+ return READ_ONCE(pte->val[0]) == READ_ONCE(new->val[0]) &&
+ READ_ONCE(pte->val[1]) == READ_ONCE(new->val[1]);
+ case PASID_ENTRY_PGTT_SL_ONLY:
+ case PASID_ENTRY_PGTT_PT:
+ /* The second 128 bits remain the same. */
+ return READ_ONCE(pte->val[2]) == READ_ONCE(new->val[2]) &&
+ READ_ONCE(pte->val[3]) == READ_ONCE(new->val[3]);
+ default:
+ WARN_ON(true);
+ }
+
+ return false;
+}
+
extern unsigned int intel_pasid_max_id;
int intel_pasid_alloc_table(struct device *dev);
void intel_pasid_free_table(struct device *dev);
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 4f36138448d8..da7ab18d3bfe 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -452,7 +452,20 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
WARN_ON(old_did != pasid_get_domain_id(pte));
- *pte = new_pte;
+ if (!pasid_support_hitless_replace(pte, &new_pte,
+ PASID_ENTRY_PGTT_FL_ONLY)) {
+ spin_unlock(&iommu->lock);
+ intel_pasid_tear_down_entry(iommu, dev, pasid, false);
+
+ return intel_pasid_setup_first_level(iommu, dev, fsptptr,
+ pasid, did, flags);
+ }
+
+ /*
+ * A first-only hitless replace requires the first 128 bits to remain
+ * the same. Only the second 128-bit chunk needs to be updated.
+ */
+ intel_iommu_atomic128_set(&pte->val128[1], new_pte.val128[1]);
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
@@ -563,7 +576,19 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
WARN_ON(old_did != pasid_get_domain_id(pte));
- *pte = new_pte;
+ if (!pasid_support_hitless_replace(pte, &new_pte,
+ PASID_ENTRY_PGTT_SL_ONLY)) {
+ spin_unlock(&iommu->lock);
+ intel_pasid_tear_down_entry(iommu, dev, pasid, false);
+
+ return intel_pasid_setup_second_level(iommu, domain, dev, pasid);
+ }
+
+ /*
+ * A second-only hitless replace requires the second 128 bits to remain
+ * the same. Only the first 128-bit chunk needs to be updated.
+ */
+ intel_iommu_atomic128_set(&pte->val128[0], new_pte.val128[0]);
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
@@ -707,7 +732,19 @@ int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
WARN_ON(old_did != pasid_get_domain_id(pte));
- *pte = new_pte;
+ if (!pasid_support_hitless_replace(pte, &new_pte,
+ PASID_ENTRY_PGTT_PT)) {
+ spin_unlock(&iommu->lock);
+ intel_pasid_tear_down_entry(iommu, dev, pasid, false);
+
+ return intel_pasid_setup_pass_through(iommu, dev, pasid);
+ }
+
+ /*
+ * A passthrough hitless replace requires the second 128 bits to remain
+ * the same. Only the first 128-bit chunk needs to be updated.
+ */
+ intel_iommu_atomic128_set(&pte->val128[0], new_pte.val128[0]);
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
@@ -903,7 +940,19 @@ int intel_pasid_replace_nested(struct intel_iommu *iommu,
WARN_ON(old_did != pasid_get_domain_id(pte));
- *pte = new_pte;
+ if (!pasid_support_hitless_replace(pte, &new_pte,
+ PASID_ENTRY_PGTT_NESTED)) {
+ spin_unlock(&iommu->lock);
+ intel_pasid_tear_down_entry(iommu, dev, pasid, false);
+
+ return intel_pasid_setup_nested(iommu, dev, pasid, domain);
+ }
+
+ /*
+ * A nested hitless replace requires the first 128 bits to remain
+ * the same. Only the second 128-bit chunk needs to be updated.
+ */
+ intel_iommu_atomic128_set(&pte->val128[1], new_pte.val128[1]);
spin_unlock(&iommu->lock);
intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
--
2.43.0
On Tue, Jan 13, 2026 at 11:00:48AM +0800, Lu Baolu wrote:
> The Intel VT-d PASID table entry is 512 bits (64 bytes). Because the
> hardware may fetch this entry in multiple 128-bit chunks, updating the
> entire entry while it is active (P=1) risks a "torn" read where the
> hardware observes an inconsistent state.
>
> However, certain updates (e.g., changing page table pointers while
> keeping the translation type and domain ID the same) can be performed
> hitlessly. This is possible if the update is limited to a single
> 128-bit chunk while the other chunks remains stable.
>
> Introduce a hitless replacement mechanism for PASID entries:
>
> - Update 'struct pasid_entry' with a union to support 128-bit
> access via the newly added val128[4] array.
> - Add pasid_support_hitless_replace() to determine if a transition
> between an old and new entry is safe to perform atomically.
> - For First-level/Nested translations: The first 128 bits (chunk 0)
> must remain identical; chunk 1 is updated atomically.
> - For Second-level/Pass-through: The second 128 bits (chunk 1)
> must remain identical; chunk 0 is updated atomically.
> - If hitless replacement is supported, use intel_iommu_atomic128_set()
> to commit the change in a single 16-byte burst.
> - If the changes are too extensive to be hitless, fall back to the
> safe "tear down and re-setup" flow (clear present -> flush -> setup).
>
> Fixes: 7543ee63e811 ("iommu/vt-d: Add pasid replace helpers")
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
> drivers/iommu/intel/pasid.h | 26 ++++++++++++++++-
> drivers/iommu/intel/pasid.c | 57 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 78 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
> index 35de1d77355f..b569e2828a8b 100644
> --- a/drivers/iommu/intel/pasid.h
> +++ b/drivers/iommu/intel/pasid.h
> @@ -37,7 +37,10 @@ struct pasid_dir_entry {
> };
>
> struct pasid_entry {
> - u64 val[8];
> + union {
> + u64 val[8];
> + u128 val128[4];
> + };
> };
>
> #define PASID_ENTRY_PGTT_FL_ONLY (1)
> @@ -297,6 +300,27 @@ static inline void pasid_set_eafe(struct pasid_entry *pe)
> pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
> }
>
> +static inline bool pasid_support_hitless_replace(struct pasid_entry *pte,
> + struct pasid_entry *new, int type)
> +{
> + switch (type) {
> + case PASID_ENTRY_PGTT_FL_ONLY:
> + case PASID_ENTRY_PGTT_NESTED:
> + /* The first 128 bits remain the same. */
> + return READ_ONCE(pte->val[0]) == READ_ONCE(new->val[0]) &&
> + READ_ONCE(pte->val[1]) == READ_ONCE(new->val[1]);
> + case PASID_ENTRY_PGTT_SL_ONLY:
> + case PASID_ENTRY_PGTT_PT:
> + /* The second 128 bits remain the same. */
> + return READ_ONCE(pte->val[2]) == READ_ONCE(new->val[2]) &&
> + READ_ONCE(pte->val[3]) == READ_ONCE(new->val[3]);
> + default:
> + WARN_ON(true);
nit: WARN_ON(false) seems a bit more suitable?
> + }
> +
> + return false;
> +}
> +
> extern unsigned int intel_pasid_max_id;
> int intel_pasid_alloc_table(struct device *dev);
> void intel_pasid_free_table(struct device *dev);
> diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
> index 4f36138448d8..da7ab18d3bfe 100644
> --- a/drivers/iommu/intel/pasid.c
> +++ b/drivers/iommu/intel/pasid.c
> @@ -452,7 +452,20 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_FL_ONLY)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_first_level(iommu, dev, fsptptr,
> + pasid, did, flags);
> + }
> +
> + /*
> + * A first-only hitless replace requires the first 128 bits to remain
> + * the same. Only the second 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[1], new_pte.val128[1]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> @@ -563,7 +576,19 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_SL_ONLY)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_second_level(iommu, domain, dev, pasid);
> + }
> +
> + /*
> + * A second-only hitless replace requires the second 128 bits to remain
> + * the same. Only the first 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[0], new_pte.val128[0]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> @@ -707,7 +732,19 @@ int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_PT)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_pass_through(iommu, dev, pasid);
> + }
> +
> + /*
> + * A passthrough hitless replace requires the second 128 bits to remain
> + * the same. Only the first 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[0], new_pte.val128[0]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> @@ -903,7 +940,19 @@ int intel_pasid_replace_nested(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_NESTED)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_nested(iommu, dev, pasid, domain);
> + }
> +
> + /*
> + * A nested hitless replace requires the first 128 bits to remain
> + * the same. Only the second 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[1], new_pte.val128[1]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> --
> 2.43.0
>
On Tue, Jan 13, 2026 at 08:40:00PM +0100, Dmytro Maluka wrote: > On Tue, Jan 13, 2026 at 11:00:48AM +0800, Lu Baolu wrote: ... > > + default: > > + WARN_ON(true); > > nit: WARN_ON(false) seems a bit more suitable? Ah sorry, ignore this. Somehow I thought this was returning true...
On Mon, Jan 12, 2026 at 7:03 PM Lu Baolu <baolu.lu@linux.intel.com> wrote:
>
> The Intel VT-d PASID table entry is 512 bits (64 bytes). Because the
> hardware may fetch this entry in multiple 128-bit chunks, updating the
> entire entry while it is active (P=1) risks a "torn" read where the
> hardware observes an inconsistent state.
>
> However, certain updates (e.g., changing page table pointers while
> keeping the translation type and domain ID the same) can be performed
> hitlessly. This is possible if the update is limited to a single
> 128-bit chunk while the other chunks remains stable.
>
> Introduce a hitless replacement mechanism for PASID entries:
>
> - Update 'struct pasid_entry' with a union to support 128-bit
> access via the newly added val128[4] array.
> - Add pasid_support_hitless_replace() to determine if a transition
> between an old and new entry is safe to perform atomically.
> - For First-level/Nested translations: The first 128 bits (chunk 0)
> must remain identical; chunk 1 is updated atomically.
Looking at the specs, the DID is part of the first 128 bits (chunk 0),
so I guess for the first level the hitless replacement would not be
supported since each domain will have a different DID?
> - For Second-level/Pass-through: The second 128 bits (chunk 1)
> must remain identical; chunk 0 is updated atomically.
> - If hitless replacement is supported, use intel_iommu_atomic128_set()
> to commit the change in a single 16-byte burst.
> - If the changes are too extensive to be hitless, fall back to the
> safe "tear down and re-setup" flow (clear present -> flush -> setup).
>
> Fixes: 7543ee63e811 ("iommu/vt-d: Add pasid replace helpers")
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
> drivers/iommu/intel/pasid.h | 26 ++++++++++++++++-
> drivers/iommu/intel/pasid.c | 57 ++++++++++++++++++++++++++++++++++---
> 2 files changed, 78 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h
> index 35de1d77355f..b569e2828a8b 100644
> --- a/drivers/iommu/intel/pasid.h
> +++ b/drivers/iommu/intel/pasid.h
> @@ -37,7 +37,10 @@ struct pasid_dir_entry {
> };
>
> struct pasid_entry {
> - u64 val[8];
> + union {
> + u64 val[8];
> + u128 val128[4];
> + };
> };
>
> #define PASID_ENTRY_PGTT_FL_ONLY (1)
> @@ -297,6 +300,27 @@ static inline void pasid_set_eafe(struct pasid_entry *pe)
> pasid_set_bits(&pe->val[2], 1 << 7, 1 << 7);
> }
>
> +static inline bool pasid_support_hitless_replace(struct pasid_entry *pte,
> + struct pasid_entry *new, int type)
> +{
> + switch (type) {
> + case PASID_ENTRY_PGTT_FL_ONLY:
> + case PASID_ENTRY_PGTT_NESTED:
> + /* The first 128 bits remain the same. */
> + return READ_ONCE(pte->val[0]) == READ_ONCE(new->val[0]) &&
> + READ_ONCE(pte->val[1]) == READ_ONCE(new->val[1]);
> + case PASID_ENTRY_PGTT_SL_ONLY:
> + case PASID_ENTRY_PGTT_PT:
> + /* The second 128 bits remain the same. */
> + return READ_ONCE(pte->val[2]) == READ_ONCE(new->val[2]) &&
> + READ_ONCE(pte->val[3]) == READ_ONCE(new->val[3]);
> + default:
> + WARN_ON(true);
> + }
> +
> + return false;
> +}
> +
> extern unsigned int intel_pasid_max_id;
> int intel_pasid_alloc_table(struct device *dev);
> void intel_pasid_free_table(struct device *dev);
> diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
> index 4f36138448d8..da7ab18d3bfe 100644
> --- a/drivers/iommu/intel/pasid.c
> +++ b/drivers/iommu/intel/pasid.c
> @@ -452,7 +452,20 @@ int intel_pasid_replace_first_level(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_FL_ONLY)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_first_level(iommu, dev, fsptptr,
> + pasid, did, flags);
> + }
> +
> + /*
> + * A first-only hitless replace requires the first 128 bits to remain
> + * the same. Only the second 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[1], new_pte.val128[1]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> @@ -563,7 +576,19 @@ int intel_pasid_replace_second_level(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_SL_ONLY)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_second_level(iommu, domain, dev, pasid);
> + }
> +
> + /*
> + * A second-only hitless replace requires the second 128 bits to remain
> + * the same. Only the first 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[0], new_pte.val128[0]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> @@ -707,7 +732,19 @@ int intel_pasid_replace_pass_through(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_PT)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_pass_through(iommu, dev, pasid);
> + }
> +
> + /*
> + * A passthrough hitless replace requires the second 128 bits to remain
> + * the same. Only the first 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[0], new_pte.val128[0]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> @@ -903,7 +940,19 @@ int intel_pasid_replace_nested(struct intel_iommu *iommu,
>
> WARN_ON(old_did != pasid_get_domain_id(pte));
>
> - *pte = new_pte;
> + if (!pasid_support_hitless_replace(pte, &new_pte,
> + PASID_ENTRY_PGTT_NESTED)) {
> + spin_unlock(&iommu->lock);
> + intel_pasid_tear_down_entry(iommu, dev, pasid, false);
> +
> + return intel_pasid_setup_nested(iommu, dev, pasid, domain);
> + }
> +
> + /*
> + * A nested hitless replace requires the first 128 bits to remain
> + * the same. Only the second 128-bit chunk needs to be updated.
> + */
> + intel_iommu_atomic128_set(&pte->val128[1], new_pte.val128[1]);
> spin_unlock(&iommu->lock);
>
> intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> --
> 2.43.0
>
On 1/14/26 03:27, Samiullah Khawaja wrote: > On Mon, Jan 12, 2026 at 7:03 PM Lu Baolu<baolu.lu@linux.intel.com> wrote: >> The Intel VT-d PASID table entry is 512 bits (64 bytes). Because the >> hardware may fetch this entry in multiple 128-bit chunks, updating the >> entire entry while it is active (P=1) risks a "torn" read where the >> hardware observes an inconsistent state. >> >> However, certain updates (e.g., changing page table pointers while >> keeping the translation type and domain ID the same) can be performed >> hitlessly. This is possible if the update is limited to a single >> 128-bit chunk while the other chunks remains stable. >> >> Introduce a hitless replacement mechanism for PASID entries: >> >> - Update 'struct pasid_entry' with a union to support 128-bit >> access via the newly added val128[4] array. >> - Add pasid_support_hitless_replace() to determine if a transition >> between an old and new entry is safe to perform atomically. >> - For First-level/Nested translations: The first 128 bits (chunk 0) >> must remain identical; chunk 1 is updated atomically. > Looking at the specs, the DID is part of the first 128 bits (chunk 0), > so I guess for the first level the hitless replacement would not be > supported since each domain will have a different DID? It's not necessarily true that each domain will have a different DID. On Intel IOMMU, all SVA domains can share a single DID. Similarly, multiple nested domains sitting on top of the same second-stage page table can also share a DID. Thanks, baolu
> From: Baolu Lu <baolu.lu@linux.intel.com> > Sent: Wednesday, January 14, 2026 1:45 PM > > On 1/14/26 03:27, Samiullah Khawaja wrote: > > On Mon, Jan 12, 2026 at 7:03 PM Lu Baolu<baolu.lu@linux.intel.com> > wrote: > >> The Intel VT-d PASID table entry is 512 bits (64 bytes). Because the > >> hardware may fetch this entry in multiple 128-bit chunks, updating the > >> entire entry while it is active (P=1) risks a "torn" read where the > >> hardware observes an inconsistent state. > >> > >> However, certain updates (e.g., changing page table pointers while > >> keeping the translation type and domain ID the same) can be performed > >> hitlessly. This is possible if the update is limited to a single > >> 128-bit chunk while the other chunks remains stable. > >> > >> Introduce a hitless replacement mechanism for PASID entries: > >> > >> - Update 'struct pasid_entry' with a union to support 128-bit > >> access via the newly added val128[4] array. > >> - Add pasid_support_hitless_replace() to determine if a transition > >> between an old and new entry is safe to perform atomically. > >> - For First-level/Nested translations: The first 128 bits (chunk 0) > >> must remain identical; chunk 1 is updated atomically. > > Looking at the specs, the DID is part of the first 128 bits (chunk 0), > > so I guess for the first level the hitless replacement would not be > > supported since each domain will have a different DID? > > It's not necessarily true that each domain will have a different DID. On > Intel IOMMU, all SVA domains can share a single DID. Similarly, multiple > nested domains sitting on top of the same second-stage page table can > also share a DID. > I guess Samiullah talked about DMA domain with first stage, where each DMA domain has a DID. The spec says that DID and pgtable pointer must be updated in one atomic operation. It applies to second-stage but not first-stage which sits in a different chunk from where DID sits. 
But thinking more I'm not sure whether that guidance is too strict. The key requirement is below: When modifying fields in present (P=1) entries, software must ensure that at any point of time during the modification (performed through single or multiple write operations), the before and after state of the entry being modified is individually self-consistent. i.e. there should be no iommu error triggered when the hw reads a partially-modified entry in that transition period - either translating via the old table or via the new table. Then the one initiating replace will ensure that in-fly DMAs will only target the addresses with same mapping in both old/new tables. Otherwise its own problem. Now let's say a flow in the iommu driver: 1) updates the first stage page pointer (in the 2nd 128bit) 2) updates the DID (in the 1st 128bit) 3) flush iommu cache before cache is flushed, it may contain: - entries tagged with old DID, with content loaded from old table - entries tagged with old DID, with content loaded from new table - entries tagged with new DID, with content loaded from new table Compared to 2nd-stage the only problematic one is old DID + new table. According to 6.2.1 (Tagging of Cached Translations), the root address of page table is not used in tagging and DID-based invalidation will flush all entries related to old DID (no matter it's from old or new table). Then it should just work! p.s. Jason said that atomic size is 128bit on AMD and 64bit on ARM. they both have DID concept and two page table pointers. So I assume it's the same case on this front?
On Wed, Jan 14, 2026 at 07:26:10AM +0000, Tian, Kevin wrote: > before cache is flushed, it may contain: > > - entries tagged with old DID, with content loaded from old table > - entries tagged with old DID, with content loaded from new table > - entries tagged with new DID, with content loaded from new table > > Compared to 2nd-stage the only problematic one is old DID + new table. > > According to 6.2.1 (Tagging of Cached Translations), the root address > of page table is not used in tagging and DID-based invalidation will > flush all entries related to old DID (no matter it's from old or new table). > > Then it should just work! Unless the original domain is attached to another device, then you've corrupted the DID and corrupted IOTLB for the second innocent device that isn't changing translation. > p.s. Jason said that atomic size is 128bit on AMD and 64bit on ARM. > they both have DID concept and two page table pointers. So I assume > it's the same case on this front? Hmm, yeah, ARM has it worse you can't change any ASID/VMID concurrently with the table pointer. You could make a safe algorithm by allocating a temporary ID, moving the current entry to the temporary ID, moving to the new pointer, moving to the final ID, then flushing the tempoary ID. It avoids the cross device issue and your logic above would hold. Or maybe the case Samiullah is interested in should have the new domain adopt the original ID.. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Wednesday, January 14, 2026 9:17 PM > > On Wed, Jan 14, 2026 at 07:26:10AM +0000, Tian, Kevin wrote: > > before cache is flushed, it may contain: > > > > - entries tagged with old DID, with content loaded from old table > > - entries tagged with old DID, with content loaded from new table > > - entries tagged with new DID, with content loaded from new table > > > > Compared to 2nd-stage the only problematic one is old DID + new table. > > > > According to 6.2.1 (Tagging of Cached Translations), the root address > > of page table is not used in tagging and DID-based invalidation will > > flush all entries related to old DID (no matter it's from old or new table). > > > > Then it should just work! > > Unless the original domain is attached to another device, then you've > corrupted the DID and corrupted IOTLB for the second innocent device > that isn't changing translation. ah, yes, that's the key point! > > > p.s. Jason said that atomic size is 128bit on AMD and 64bit on ARM. > > they both have DID concept and two page table pointers. So I assume > > it's the same case on this front? > > Hmm, yeah, ARM has it worse you can't change any ASID/VMID > concurrently with the table pointer. > > You could make a safe algorithm by allocating a temporary ID, moving > the current entry to the temporary ID, moving to the new pointer, > moving to the final ID, then flushing the tempoary ID. right, that ensures the corrupted state is only associated with the temporary ID. > > It avoids the cross device issue and your logic above would hold. > > Or maybe the case Samiullah is interested in should have the new > domain adopt the original ID.. > that also works. the KHO resume process could have a special DID allocation scheme to reuse the original one. or in Samiullah's case the old/new domains always contains the same mappings, so no corruption would ever occur, but that'd be a very KHO specific assumption. 😊
On Thu, Jan 15, 2026 at 05:44:08AM +0000, Tian, Kevin wrote: > or in Samiullah's case the old/new domains always contains the > same mappings, so no corruption would ever occur, but that'd be > a very KHO specific assumption. 😊 The thing witih KHO is we don't actually know this is true. We hope it is true, but ultimately userspace is responsible to do it, and the kernel doesn't check it. This is why the hitless update in an appealing solution because it means we don't need to deal with trying to adopt an entire page table prepared by another kernel and ensuring this kernel has IOAS areas that fully cover and exactly match what is in the table. Unfortunately to do the extra DID will require a complex invalidation sequence to hook multiple DIDs into the same domain simultaneously.. Nicolin is working on this for ARM and I think Intel's linked list scheme could be tasked to do it too.. Jason
> From: Jason Gunthorpe <jgg@nvidia.com> > Sent: Thursday, January 15, 2026 9:28 PM > > On Thu, Jan 15, 2026 at 05:44:08AM +0000, Tian, Kevin wrote: > > > or in Samiullah's case the old/new domains always contains the > > same mappings, so no corruption would ever occur, but that'd be > > a very KHO specific assumption. 😊 > > The thing witih KHO is we don't actually know this is true. We hope it > is true, but ultimately userspace is responsible to do it, and the > kernel doesn't check it. Make sense. Then the option having the new domain adopt the original ID could not work either. We cannot have one DID tagging two domains which may contain different mappings. > > This is why the hitless update in an appealing solution because it > means we don't need to deal with trying to adopt an entire page table > prepared by another kernel and ensuring this kernel has IOAS areas > that fully cover and exactly match what is in the table. > > Unfortunately to do the extra DID will require a complex invalidation > sequence to hook multiple DIDs into the same domain > simultaneously.. Nicolin is working on this for ARM and I think > Intel's linked list scheme could be tasked to do it too.. > yes
On Wed, Jan 14, 2026 at 5:17 AM Jason Gunthorpe <jgg@nvidia.com> wrote: > > On Wed, Jan 14, 2026 at 07:26:10AM +0000, Tian, Kevin wrote: > > before cache is flushed, it may contain: > > > > - entries tagged with old DID, with content loaded from old table > > - entries tagged with old DID, with content loaded from new table > > - entries tagged with new DID, with content loaded from new table > > > > Compared to 2nd-stage the only problematic one is old DID + new table. > > > > According to 6.2.1 (Tagging of Cached Translations), the root address > > of page table is not used in tagging and DID-based invalidation will > > flush all entries related to old DID (no matter it's from old or new table). > > > > Then it should just work! > > Unless the original domain is attached to another device, then you've > corrupted the DID and corrupted IOTLB for the second innocent device > that isn't changing translation. > > > p.s. Jason said that atomic size is 128bit on AMD and 64bit on ARM. > > they both have DID concept and two page table pointers. So I assume > > it's the same case on this front? > > Hmm, yeah, ARM has it worse you can't change any ASID/VMID > concurrently with the table pointer. > > You could make a safe algorithm by allocating a temporary ID, moving > the current entry to the temporary ID, moving to the new pointer, > moving to the final ID, then flushing the tempoary ID. Interesting Idea. I think this can be done. > > It avoids the cross device issue and your logic above would hold. > > Or maybe the case Samiullah is interested in should have the new > domain adopt the original ID.. There is no immediate use case for first level, as I understand we use second level for all our VM use cases. But looking at the code and specs, the first level is used for pasid compatible domains (using IOMMU_HWPT_ALLOC_PASID flag) and it will not be replaceable hitlessly.. > > Jason
On Wed, Jan 14, 2026 at 10:51:02AM -0800, Samiullah Khawaja wrote: > There is no immediate use case for first level, as I understand we use > second level for all our VM use cases. But looking at the code and > specs, the first level is used for pasid compatible domains (using > IOMMU_HWPT_ALLOC_PASID flag) and it will not be replaceable > hitlessly.. If the VMM allocates using the NEST_PARENT flag it will always get a S2, and that is a reasonable thing for the VMM to do. Jason
On Tue, Jan 13, 2026 at 11:27:30AM -0800, Samiullah Khawaja wrote: > > - Update 'struct pasid_entry' with a union to support 128-bit > > access via the newly added val128[4] array. > > - Add pasid_support_hitless_replace() to determine if a transition > > between an old and new entry is safe to perform atomically. > > - For First-level/Nested translations: The first 128 bits (chunk 0) > > must remain identical; chunk 1 is updated atomically. > > Looking at the specs, the DID is part of the first 128 bits (chunk 0), > so I guess for the first level the hitless replacement would not be > supported since each domain will have a different DID? Ah, yeah, you wont be able to do the KHO replace thing when using a first stage.. Jason
On Tue, Jan 13, 2026 at 11:00:48AM +0800, Lu Baolu wrote:
> +static inline bool pasid_support_hitless_replace(struct pasid_entry *pte,
> + struct pasid_entry *new, int type)
> +{
> + switch (type) {
> + case PASID_ENTRY_PGTT_FL_ONLY:
> + case PASID_ENTRY_PGTT_NESTED:
> + /* The first 128 bits remain the same. */
> + return READ_ONCE(pte->val[0]) == READ_ONCE(new->val[0]) &&
> + READ_ONCE(pte->val[1]) == READ_ONCE(new->val[1]);
pte->val128[0] == new->val128[0]
> + case PASID_ENTRY_PGTT_SL_ONLY:
> + case PASID_ENTRY_PGTT_PT:
> + /* The second 128 bits remain the same. */
> + return READ_ONCE(pte->val[2]) == READ_ONCE(new->val[2]) &&
> + READ_ONCE(pte->val[3]) == READ_ONCE(new->val[3]);
These READ_ONCE's are pointless, especially the ones on new.
With 5 words to worry about I really feel strongly this should just
use the ARM algorithm. It handles everything very elegantly, we can
lift it out of ARM and make it general.
Here, I did a quick refactoring into general code:
https://github.com/jgunthorpe/linux/commits/for-baolu/
You just need to provide a used function to compute which bits HW is
not ignoring and a sync function to push the invalidation command. It
will take care of all sequencing needs for all possible new/old
combinations.
Then delete the replace/not replace split in the code too.
Jason
On 1/13/26 23:05, Jason Gunthorpe wrote:
> On Tue, Jan 13, 2026 at 11:00:48AM +0800, Lu Baolu wrote:
>> +static inline bool pasid_support_hitless_replace(struct pasid_entry *pte,
>> + struct pasid_entry *new, int type)
>> +{
>> + switch (type) {
>> + case PASID_ENTRY_PGTT_FL_ONLY:
>> + case PASID_ENTRY_PGTT_NESTED:
>> + /* The first 128 bits remain the same. */
>> + return READ_ONCE(pte->val[0]) == READ_ONCE(new->val[0]) &&
>> + READ_ONCE(pte->val[1]) == READ_ONCE(new->val[1]);
>
> pte->val128[0] == new->val128[0]
>
>> + case PASID_ENTRY_PGTT_SL_ONLY:
>> + case PASID_ENTRY_PGTT_PT:
>> + /* The second 128 bits remain the same. */
>> + return READ_ONCE(pte->val[2]) == READ_ONCE(new->val[2]) &&
>> + READ_ONCE(pte->val[3]) == READ_ONCE(new->val[3]);
>
> These READ_ONCE's are pointless, especially the ones on new.
>
> With 5 words to worry about I really feel strongly this should just
> use the ARM algorithm. It handles everything very elegantly, we can
> lift it out of ARM and make it general.
>
> Here, I did a quick refactoring into general code:
>
> https://github.com/jgunthorpe/linux/commits/for-baolu/
>
> You just need to provide a used function to compute which bits HW is
> not ignoring and a sync function to push the invalidation command. It
> will take care of all sequencing needs for all possible new/old
> combinations.
It's always good to generate common library code to avoid boilerplate
code and different behaviors across multiple drivers. I'll try to
integrate this into the next version. Thanks for the help!
>
> Then delete the replace/not replace split in the code too.
>
> Jason
Thanks,
baolu
© 2016 - 2026 Red Hat, Inc.