[PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates

Lu Baolu posted 8 patches 1 month ago
[PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Lu Baolu 1 month ago
Updating PASID table entries while the device hardware is possibly
performing DMA concurrently is complex. Traditionally, this required
a "clear-then-update" approach — clearing the Present bit, flushing
caches, updating the entry, and then restoring the Present bit. This
causes unnecessary latency or interruptions for transactions that might
not even be affected by the specific bits being changed.

Plumb this driver into the generic entry_sync library to modernize
this process. The library uses the concept of "Used bits" to determine
if a transition can be performed "hitlessly" (via a single atomic
128-bit swap) or if a disruptive 3-step update is truly required.

The implementation includes:

- intel_pasid_get_used(): Defines which bits the IOMMU hardware is
  sensitive to based on the PGTT.
- intel_pasid_sync(): Handles the required clflushes, PASID cache
  invalidations, and IOTLB/Dev-TLB flushes required between update
  steps.
- 128-bit atomicity: Depends on IOMMU_ENTRY_SYNC128 to ensure that
  256-bit PASID entries are updated in atomic 128-bit quanta,
  preventing the hardware from ever seeing a "torn" entry.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/intel/Kconfig |   2 +
 drivers/iommu/intel/pasid.c | 173 ++++++++++++++++++++++++++++++++++++
 2 files changed, 175 insertions(+)

diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 5471f814e073..7fa31b9d4ef4 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -26,6 +26,8 @@ config INTEL_IOMMU
 	select PCI_ATS
 	select PCI_PRI
 	select PCI_PASID
+	select IOMMU_ENTRY_SYNC
+	select IOMMU_ENTRY_SYNC128
 	help
 	  DMA remapping (DMAR) devices support enables independent address
 	  translations for Direct Memory Access (DMA) from devices.
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 9d30015b8940..5b9eb5c8f42d 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -21,12 +21,185 @@
 #include "iommu.h"
 #include "pasid.h"
 #include "../iommu-pages.h"
+#include "../entry_sync.h"
 
 /*
  * Intel IOMMU system wide PASID name space:
  */
 u32 intel_pasid_max_id = PASID_MAX;
 
+/*
+ * Plumb into the generic entry_sync library:
+ */
+static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid);
+static void pasid_flush_caches(struct intel_iommu *iommu, struct pasid_entry *pte,
+			       u32 pasid, u16 did);
+static void intel_pasid_flush_present(struct intel_iommu *iommu, struct device *dev,
+				      u32 pasid, u16 did, struct pasid_entry *pte);
+static void pasid_cache_invalidation_with_pasid(struct intel_iommu *iommu,
+						u16 did, u32 pasid);
+static void devtlb_invalidation_with_pasid(struct intel_iommu *iommu,
+					   struct device *dev, u32 pasid);
+
+struct intel_pasid_writer {
+	struct entry_sync_writer128 writer;
+	struct intel_iommu *iommu;
+	struct device *dev;
+	u32 pasid;
+	struct pasid_entry orig_pte;
+	bool was_present;
+};
+
+/*
+ * Identify which bits of the 256-bit entry the HW is using. The "Used" bits
+ * are those that, if changed, would cause the IOMMU to behave differently
+ * for an active transaction.
+ */
+static void intel_pasid_get_used(const u128 *entry, u128 *used)
+{
+	struct pasid_entry *pe = (struct pasid_entry *)entry;
+	struct pasid_entry *ue = (struct pasid_entry *)used;
+	u16 pgtt;
+
+	/* Initialize used bits to 0. */
+	memset(ue, 0, sizeof(*ue));
+
+	/* Present bit always matters. */
+	ue->val[0] |= PASID_PTE_PRESENT;
+
+	/* Nothing more for non-present entries. */
+	if (!(pe->val[0] & PASID_PTE_PRESENT))
+		return;
+
+	pgtt = pasid_pte_get_pgtt(pe);
+	switch (pgtt) {
+	case PASID_ENTRY_PGTT_FL_ONLY:
+		/* AW, PGTT */
+		ue->val[0] |= GENMASK_ULL(4, 2) | GENMASK_ULL(8, 6);
+		/* DID, PWSNP, PGSNP */
+		ue->val[1] |= GENMASK_ULL(24, 23) | GENMASK_ULL(15, 0);
+		/* FSPTPTR, FSPM */
+		ue->val[2] |= GENMASK_ULL(63, 12) | GENMASK_ULL(3, 2);
+		break;
+	case PASID_ENTRY_PGTT_NESTED:
+		/* FPD, AW, PGTT, SSADE, SSPTPTR*/
+		ue->val[0] |= GENMASK_ULL(63, 12) | GENMASK_ULL(9, 6) |
+				GENMASK_ULL(4, 1);
+		/* PGSNP, DID, PWSNP */
+		ue->val[1] |= GENMASK_ULL(24, 23) | GENMASK_ULL(15, 0);
+		/* FSPTPTR, FSPM, EAFE, WPE, SRE */
+		ue->val[2] |= GENMASK_ULL(63, 12) | BIT_ULL(7) |
+				GENMASK_ULL(4, 2) | BIT_ULL(0);
+		break;
+	case PASID_ENTRY_PGTT_SL_ONLY:
+		/* FPD, AW, PGTT, SSADE, SSPTPTR */
+		ue->val[0] |= GENMASK_ULL(63, 12) | GENMASK_ULL(9, 6) |
+				GENMASK_ULL(4, 1);
+		/* DID, PWSNP */
+		ue->val[1] |= GENMASK_ULL(15, 0) | BIT_ULL(23);
+		break;
+	case PASID_ENTRY_PGTT_PT:
+		/* FPD, AW, PGTT */
+		ue->val[0] |= GENMASK_ULL(4, 2) | GENMASK_ULL(8, 6) | BIT_ULL(1);
+		/* DID, PWSNP */
+		ue->val[1] |= GENMASK_ULL(15, 0) | BIT_ULL(23);
+		break;
+	default:
+		WARN_ON(true);
+	}
+}
+
+static void intel_pasid_sync(struct entry_sync_writer128 *writer)
+{
+	struct intel_pasid_writer *p_writer = container_of(writer,
+			struct intel_pasid_writer, writer);
+	struct intel_iommu *iommu = p_writer->iommu;
+	struct device *dev = p_writer->dev;
+	bool was_present, is_present;
+	u32 pasid = p_writer->pasid;
+	struct pasid_entry *pte;
+	u16 old_did, old_pgtt;
+
+	pte = intel_pasid_get_entry(dev, pasid);
+	was_present = p_writer->was_present;
+	is_present = pasid_pte_is_present(pte);
+	old_did = pasid_get_domain_id(&p_writer->orig_pte);
+	old_pgtt = pasid_pte_get_pgtt(&p_writer->orig_pte);
+
+	/* Update the last present state: */
+	p_writer->was_present = is_present;
+
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(pte, sizeof(*pte));
+
+	/* Sync for "P=0" to "P=1": */
+	if (!was_present) {
+		if (is_present)
+			pasid_flush_caches(iommu, pte, pasid,
+					   pasid_get_domain_id(pte));
+
+		return;
+	}
+
+	/* Sync for "P=1" to "P=1": */
+	if (is_present) {
+		intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
+		return;
+	}
+
+	/* Sync for "P=1" to "P=0": */
+	pasid_cache_invalidation_with_pasid(iommu, old_did, pasid);
+
+	if (old_pgtt == PASID_ENTRY_PGTT_PT || old_pgtt == PASID_ENTRY_PGTT_FL_ONLY)
+		qi_flush_piotlb(iommu, old_did, pasid, 0, -1, 0);
+	else
+		iommu->flush.flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH);
+
+	devtlb_invalidation_with_pasid(iommu, dev, pasid);
+}
+
+static const struct entry_sync_writer_ops128 writer_ops128 = {
+	.get_used = intel_pasid_get_used,
+	.sync = intel_pasid_sync,
+};
+
+#define INTEL_PASID_SYNC_MEM_COUNT	12
+
+static int __maybe_unused intel_pasid_write(struct intel_iommu *iommu,
+					    struct device *dev, u32 pasid,
+					    u128 *target)
+{
+	struct pasid_entry *pte = intel_pasid_get_entry(dev, pasid);
+	struct intel_pasid_writer p_writer = {
+		.writer = {
+			.ops = &writer_ops128,
+			/* 512 bits total (4 * 128-bit chunks) */
+			.num_quantas = 4,
+			/* The 'P' bit is in the first 128-bit chunk */
+			.vbit_quanta = 0,
+		},
+		.iommu = iommu,
+		.dev = dev,
+		.pasid = pasid,
+	};
+	u128 memory[INTEL_PASID_SYNC_MEM_COUNT];
+
+	if (!pte)
+		return -ENODEV;
+
+	p_writer.orig_pte = *pte;
+	p_writer.was_present = pasid_pte_is_present(pte);
+
+	/*
+	 * The library now does the heavy lifting:
+	 * 1. Checks if it can do a 1-quanta hitless flip.
+	 * 2. If not, it does a 3-step V=0 (disruptive) update.
+	 */
+	entry_sync_write128(&p_writer.writer, (u128 *)pte, target, memory, sizeof(memory));
+
+	return 0;
+}
+
 /*
  * Per device pasid table management:
  */
-- 
2.43.0

Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Jason Gunthorpe 1 month ago
On Mon, Mar 09, 2026 at 02:06:42PM +0800, Lu Baolu wrote:
> +static void intel_pasid_get_used(const u128 *entry, u128 *used)
> +{
> +	struct pasid_entry *pe = (struct pasid_entry *)entry;
> +	struct pasid_entry *ue = (struct pasid_entry *)used;
> +	u16 pgtt;
> +
> +	/* Initialize used bits to 0. */
> +	memset(ue, 0, sizeof(*ue));
> +
> +	/* Present bit always matters. */
> +	ue->val[0] |= PASID_PTE_PRESENT;
> +
> +	/* Nothing more for non-present entries. */
> +	if (!(pe->val[0] & PASID_PTE_PRESENT))
> +		return;
> +
> +	pgtt = pasid_pte_get_pgtt(pe);
> +	switch (pgtt) {
> +	case PASID_ENTRY_PGTT_FL_ONLY:
> +		/* AW, PGTT */
> +		ue->val[0] |= GENMASK_ULL(4, 2) | GENMASK_ULL(8, 6);
> +		/* DID, PWSNP, PGSNP */
> +		ue->val[1] |= GENMASK_ULL(24, 23) | GENMASK_ULL(15, 0);
> +		/* FSPTPTR, FSPM */
> +		ue->val[2] |= GENMASK_ULL(63, 12) | GENMASK_ULL(3, 2);

This would be an excellent time to properly add these constants :(

/* 9.6 Scalable-Mode PASID Table Entry */
#define SM_PASID0_P		BIT_U64(0)
#define SM_PASID0_FPD		BIT_U64(1)
#define SM_PASID0_AW		GENMASK_U64(4, 2)
#define SM_PASID0_SSEE		BIT_U64(5)
#define SM_PASID0_PGTT		GENMASK_U64(8, 6)
#define SM_PASID0_SSADE		BIT_U64(9)
#define SM_PASID0_SSPTPTR	GENMASK_U64(63, 12)

#define SM_PASID1_DID		GENMASK_U64(15, 0)
#define SM_PASID1_PWSNP		BIT_U64(23)
#define SM_PASID1_PGSNP		BIT_U64(24)
#define SM_PASID1_CD		BIT_U64(25)
#define SM_PASID1_EMTE		BIT_U64(26)
#define SM_PASID1_PAT		GENMASK_U64(63, 32)

#define SM_PASID2_SRE		BIT_U64(0)
#define SM_PASID2_ERE		BIT_U64(1)
#define SM_PASID2_FSPM		GENMASK_U64(3, 2)
#define SM_PASID2_WPE		BIT_U64(4)
#define SM_PASID2_NXE		BIT_U64(5)
#define SM_PASID2_SMEP		BIT_U64(6)
#define SM_PASID2_EAFE		BIT_U64(7)
#define SM_PASID2_FSPTPTR	GENMASK_U64(63, 12)

> +static void intel_pasid_sync(struct entry_sync_writer128 *writer)
> +{
> +	struct intel_pasid_writer *p_writer = container_of(writer,
> +			struct intel_pasid_writer, writer);
> +	struct intel_iommu *iommu = p_writer->iommu;
> +	struct device *dev = p_writer->dev;
> +	bool was_present, is_present;
> +	u32 pasid = p_writer->pasid;
> +	struct pasid_entry *pte;
> +	u16 old_did, old_pgtt;
> +
> +	pte = intel_pasid_get_entry(dev, pasid);
> +	was_present = p_writer->was_present;
> +	is_present = pasid_pte_is_present(pte);
> +	old_did = pasid_get_domain_id(&p_writer->orig_pte);
> +	old_pgtt = pasid_pte_get_pgtt(&p_writer->orig_pte);
> +
> +	/* Update the last present state: */
> +	p_writer->was_present = is_present;
> +
> +	if (!ecap_coherent(iommu->ecap))
> +		clflush_cache_range(pte, sizeof(*pte));
> +
> +	/* Sync for "P=0" to "P=1": */
> +	if (!was_present) {
> +		if (is_present)
> +			pasid_flush_caches(iommu, pte, pasid,
> +					   pasid_get_domain_id(pte));
> +
> +		return;
> +	}
> +
> +	/* Sync for "P=1" to "P=1": */
> +	if (is_present) {
> +		intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
> +		return;
> +	}
> +
> +	/* Sync for "P=1" to "P=0": */
> +	pasid_cache_invalidation_with_pasid(iommu, old_did, pasid);

Why all this logic? All this different stuff does is meddle with the
IOTLB and it should not be seen below.

If the sync is called it should just always call
pasid_cache_invalidation_with_pasid(), that's it.

Writer has already eliminated all cases where sync isn't needed.

> +	if (old_pgtt == PASID_ENTRY_PGTT_PT || old_pgtt == PASID_ENTRY_PGTT_FL_ONLY)
> +		qi_flush_piotlb(iommu, old_did, pasid, 0, -1, 0);
> +	else
> +		iommu->flush.flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH);
> +	devtlb_invalidation_with_pasid(iommu, dev, pasid);

The IOTLB should already be cleaned before the new entry using the
cache tag is programmed. Cleaning it after the entry is live is buggy.

The writer logic ensures it never sees a corrupted entry, so the clean
cache tag cannot be mangled during the writing process.

The way ARM is structured has the cache tags clean if they are in the
allocator bitmap, so when the driver fetches a new tag and starts
using it, the tag is clean and no cleaning is needed.

When it frees a tag it cleans it and then returns it to the allocator.

ATC invalidations should always be done after the PASID entry is
written. During a hitless update both translations are unpredictably
combined, this is unavoidable and OK.

Jason
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Baolu Lu 4 weeks ago
On 3/9/26 21:41, Jason Gunthorpe wrote:
>> +static void intel_pasid_sync(struct entry_sync_writer128 *writer)
>> +{
>> +	struct intel_pasid_writer *p_writer = container_of(writer,
>> +			struct intel_pasid_writer, writer);
>> +	struct intel_iommu *iommu = p_writer->iommu;
>> +	struct device *dev = p_writer->dev;
>> +	bool was_present, is_present;
>> +	u32 pasid = p_writer->pasid;
>> +	struct pasid_entry *pte;
>> +	u16 old_did, old_pgtt;
>> +
>> +	pte = intel_pasid_get_entry(dev, pasid);
>> +	was_present = p_writer->was_present;
>> +	is_present = pasid_pte_is_present(pte);
>> +	old_did = pasid_get_domain_id(&p_writer->orig_pte);
>> +	old_pgtt = pasid_pte_get_pgtt(&p_writer->orig_pte);
>> +
>> +	/* Update the last present state: */
>> +	p_writer->was_present = is_present;
>> +
>> +	if (!ecap_coherent(iommu->ecap))
>> +		clflush_cache_range(pte, sizeof(*pte));
>> +
>> +	/* Sync for "P=0" to "P=1": */
>> +	if (!was_present) {
>> +		if (is_present)
>> +			pasid_flush_caches(iommu, pte, pasid,
>> +					   pasid_get_domain_id(pte));
>> +
>> +		return;
>> +	}
>> +
>> +	/* Sync for "P=1" to "P=1": */
>> +	if (is_present) {
>> +		intel_pasid_flush_present(iommu, dev, pasid, old_did, pte);
>> +		return;
>> +	}
>> +
>> +	/* Sync for "P=1" to "P=0": */
>> +	pasid_cache_invalidation_with_pasid(iommu, old_did, pasid);
> Why all this logic? All this different stuff does is meddle with the
> IOTLB and it should not seen below.
> 
> If the sync is called it should just always call
> pasid_cache_invalidation_with_pasid(), that's it.
> 
> Writer has already eliminated all cases where sync isn't needed.

You're right. The library should simplify things. I will remove the
state tracking. The callback will only ensure that memory is flushed
(for non-coherent mode) and the relevant PASID cache is invalidated.

> 
>> +	if (old_pgtt == PASID_ENTRY_PGTT_PT || old_pgtt == PASID_ENTRY_PGTT_FL_ONLY)
>> +		qi_flush_piotlb(iommu, old_did, pasid, 0, -1, 0);
>> +	else
>> +		iommu->flush.flush_iotlb(iommu, old_did, 0, 0, DMA_TLB_DSI_FLUSH);
>> +	devtlb_invalidation_with_pasid(iommu, dev, pasid);
> The IOTLB should already be clean'd before the new entry using the
> cache tag is programmed. Cleaning it after the entry is live is buggy.
> 
> The writer logic ensures it never sees a corrupted entry, so the clean
> cache tag cannot be mangled during the writing process.
> 
> The way ARM is structured has the cache tags clean if they are in the
> allocator bitmap, so when the driver fetches a new tag and starts
> using it is clean and non cleaning is needed
> 
> When it frees a tag it cleans it and then returns it to the allocator.

If I understand your remark correctly, the driver should only need the
following in the sync callback:

- clflush (if non-coherent) to ensure the entry is in physical memory.
- PASID cache invalidation to force the hardware to re-read the entry.
- Device-TLB invalidation to drop local device caches.

Does that sound right? I can move the general IOTLB/PIOTLB invalidation
logic to the domain detach/free paths.

> ATC invalidations should always be done after the PASID entry is
> written. During a hitless update both translations are unpredictably
> combined, this is unavoidable and OK.

The VT-d spec (Sections 6.5.2.5 and 6.5.2.6) explicitly mandates that an
IOTLB invalidation must precede the Device-TLB invalidation. If we only
do the device-TLB invalidation in the sync callback, we risk the device
re-fetching a stale translation from the IOMMU's internal IOTLB.

Thanks,
baolu
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Jason Gunthorpe 4 weeks ago
On Thu, Mar 12, 2026 at 03:50:03PM +0800, Baolu Lu wrote:
> If I understand your remark correctly, the driver should only need the
> following in the sync callback:
> 
> - clflush (if non-coherent) to ensure the entry is in physical memory.
> - PASID cache invalidation to force the hardware to re-read the entry.

Yes

> - Device-TLB invalidation to drop local device caches.

I have preferred to keep this outside the entry_sync system since it has
nothing to do with updating the context entry.

There should be only one ATS flush after the new entry is installed.

> > ATC invalidations should always be done after the PASID entry is
> > written. During a hitless update both translations are unpredictably
> > combined, this is unavoidable and OK.
> 
> The VT-d spec (Sections 6.5.2.5 and 6.5.2.6) explicitly mandates that an
> IOTLB invalidation must precede the Device-TLB invalidation. If we only
> do the device-TLB invalidation in the sync callback, we risk the device
> re-fetching a stale translation from the IOMMU's internal IOTLB.

It is a little weird that it says that; that is worth checking into.

The other text is clear that the IOTLB is cached by DID,PASID only, so
if the new PASID entry has a DID,PASID which is already coherent in
the IOTLB it should not need any IOTLB flushing.

ie flushing the PASID table should immediately change any ATC fetches
from using DID,old_PASID to DID,new_PASID.

If there is some issue where the PASID flush doesn't fence everything
(ie an ATC fetch of DID,old_PASID can be passed by an ATC invalidation)
then you may need IOTLB invalidations not to manage coherence but to
manage ordering. That is an important detail if true.

Jason
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Baolu Lu 3 weeks, 4 days ago
On 3/12/26 19:44, Jason Gunthorpe wrote:
> On Thu, Mar 12, 2026 at 03:50:03PM +0800, Baolu Lu wrote:
>> If I understand your remark correctly, the driver should only need the
>> following in the sync callback:
>>
>> - clflush (if non-coherent) to ensure the entry is in physical memory.
>> - PASID cache invalidation to force the hardware to re-read the entry.
> 
> Yes
> 
>> - Device-TLB invalidation to drop local device caches.
> 
> I have prefered to keep this outside the entry_set system since it has
> nothing to do with updating the context entry.
> 
> There should be only one ATS flush after the new entry is installed.

Okay, I will move the devtlb_invalidation_with_pasid() calls outside of
the entry_sync system, right after the call to the writer returns.

> 
>>> ATC invalidations should always be done after the PASID entry is
>>> written. During a hitless update both translations are unpredictably
>>> combined, this is unavoidable and OK.
>>
>> The VT-d spec (Sections 6.5.2.5 and 6.5.2.6) explicitly mandates that an
>> IOTLB invalidation must precede the Device-TLB invalidation. If we only
>> do the device-TLB invalidation in the sync callback, we risk the device
>> re-fetching a stale translation from the IOMMU's internal IOTLB.
> 
> It is a little weird that is says that, that is worth checking into.
> 
> The other text is clear that the IOTLB is cached by DID,PASID only, so
> if the new PASID entry has a DID,PASID which is already coherent in
> the IOTLB it should not need any IOTLB flushing.
> 
> ie flushing the PASID table should immediately change any ATC fetches
> from using DID,old_PASID to DID,new_PASID.
> 
> If there is some issue where the PASID flush doesn't fence everything
> (ie an ATC fetch of DID,old_PASID can be passed by an ATC invalidation)
> then you may need IOTLB invalidations not to manage coherence but to
> manage ordering. That is an important detail if true.

On Intel hardware, the PASID-cache and IOTLB are not inclusive. A PASID-
cache invalidation forces a re-fetch of the pasid entry, but it does not
automatically purge downstream IOTLB entries. The spec-mandated IOTLB
flush serves as a synchronization barrier to ensure that in-flight
translation requests are drained and the internal IOMMU state is
consistent before the invalidation request is sent over PCIe to the
device's ATC.

Without this "IOTLB -> Wait Descriptor -> ATC" sequence, there is a risk
that the device re-populates its ATC from a stale entry still residing
in the IOMMU's internal IOTLB, even after the PASID entry itself has
been updated.

Thanks,
baolu
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Jason Gunthorpe 2 weeks, 2 days ago
On Sun, Mar 15, 2026 at 04:11:36PM +0800, Baolu Lu wrote:
> > > > ATC invalidations should always be done after the PASID entry is
> > > > written. During a hitless update both translations are unpredictably
> > > > combined, this is unavoidable and OK.
> > > 
> > > The VT-d spec (Sections 6.5.2.5 and 6.5.2.6) explicitly mandates that an
> > > IOTLB invalidation must precede the Device-TLB invalidation. If we only
> > > do the device-TLB invalidation in the sync callback, we risk the device
> > > re-fetching a stale translation from the IOMMU's internal IOTLB.
> > 
> > It is a little weird that is says that, that is worth checking into.
> > 
> > The other text is clear that the IOTLB is cached by DID,PASID only, so
> > if the new PASID entry has a DID,PASID which is already coherent in
> > the IOTLB it should not need any IOTLB flushing.
> > 
> > ie flushing the PASID table should immediately change any ATC fetches
> > from using DID,old_PASID to DID,new_PASID.
> > 
> > If there is some issue where the PASID flush doesn't fence everything
> > (ie an ATC fetch of DID,old_PASID can be passed by an ATC invalidation)
> > then you may need IOTLB invalidations not to manage coherence but to
> > manage ordering. That is an important detail if true.
> 
> On Intel hardware, the PASID-cache and IOTLB are not inclusive. A PASID-
> cache invalidation forces a re-fetch of the pasid entry, but it does not
> automatically purge downstream IOTLB entries.

It doesn't matter, the updated PASID entry will point to a new DID and
the IOTLB (new DID,PASID) entry will be valid in the IOTLB.

We don't need to flush the IOTLB, we just need to ensure that all
lookups done with (old DID,PASID) are completed before sending any
invalidation.

> The spec-mandated IOTLB flush serves as a synchronization barrier to
> ensure that in-flight translation requests are drained and the
> internal IOMMU state is consistent before the invalidation request
> is sent over PCIe to the device's ATC.

A fencing requirement does make sense, but does it have to be done by
flushing the entire DID,PASID? It is ugly to have to drop the IOTLB
just because a context entry changed.

Can you do a 4k IOVA 0 invalidation and get the same fence?

Jason
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Baolu Lu 2 weeks, 2 days ago
On 3/23/26 21:07, Jason Gunthorpe wrote:
> On Sun, Mar 15, 2026 at 04:11:36PM +0800, Baolu Lu wrote:
>>>>> ATC invalidations should always be done after the PASID entry is
>>>>> written. During a hitless update both translations are unpredictably
>>>>> combined, this is unavoidable and OK.
>>>>
>>>> The VT-d spec (Sections 6.5.2.5 and 6.5.2.6) explicitly mandates that an
>>>> IOTLB invalidation must precede the Device-TLB invalidation. If we only
>>>> do the device-TLB invalidation in the sync callback, we risk the device
>>>> re-fetching a stale translation from the IOMMU's internal IOTLB.
>>>
>>> It is a little weird that is says that, that is worth checking into.
>>>
>>> The other text is clear that the IOTLB is cached by DID,PASID only, so
>>> if the new PASID entry has a DID,PASID which is already coherent in
>>> the IOTLB it should not need any IOTLB flushing.
>>>
>>> ie flushing the PASID table should immediately change any ATC fetches
>>> from using DID,old_PASID to DID,new_PASID.
>>>
>>> If there is some issue where the PASID flush doesn't fence everything
>>> (ie an ATC fetch of DID,old_PASID can be passed by an ATC invalidation)
>>> then you may need IOTLB invalidations not to manage coherence but to
>>> manage ordering. That is an important detail if true.
>>
>> On Intel hardware, the PASID-cache and IOTLB are not inclusive. A PASID-
>> cache invalidation forces a re-fetch of the pasid entry, but it does not
>> automatically purge downstream IOTLB entries.
> 
> It doesn't matter, the updated PASID entry will point to a new DID and
> the IOTLB (new DID,PASID) entry will be valid in the IOTLB.
> 
> We don't need to flush the IOTLB, we just need to ensure that all
> lookups done with (old DID,PASID) are completed before sending any
> invalidation.

Yes, you are right.

> 
>> The spec-mandated IOTLB flush serves as a synchronization barrier to
>> ensure that in-flight translation requests are drained and the
>> internal IOMMU state is consistent before the invalidation request
>> is sent over PCIe to the device's ATC.
> 
> A fencing requirement does make sense, but does it have to be done by
> flushing the entire DID,PASID? It is ugly to have to drop the IOTLB
> just because a context entry changed.

I believe the full [old_DID, PASID] invalidation is a functional
necessity rather than just a fencing requirement. Even though the new
PASID entry points to a new_DID, leaving stale translations tagged with
[old_DID, PASID] in the IOTLB is problematic.

However, I agree that IOTLB and Device-TLB invalidation should not be
part of the entry_sync for a PASID entry; instead, it belongs in the
domain replacement logic.

> Can you do a 4k IOVA 0 invalidation and get the same fence?
> 
> Jason

Thanks,
baolu
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Jason Gunthorpe 2 weeks, 1 day ago
On Tue, Mar 24, 2026 at 02:22:21PM +0800, Baolu Lu wrote:
> > A fencing requirement does make sense, but does it have to be done by
> > flushing the entire DID,PASID? It is ugly to have to drop the IOTLB
> > just because a context entry changed.
> 
> I believe the full [old_DID, PASID] invalidation is a functional
> necessity rather than just a fencing requirement. Even though the new
> PASID entry points to a new_DID, leaving stale translations tagged with
> [old_DID, PASID] in the IOTLB is problematic.

You don't know they are stale, the old_DID,PASID could be used by
another context entry.

The proper time to declare an IOTLB tag as stale is when it is
returned back to the allocator.

The problem in the VT-d design is that it is complicated to manage the
DID lifecycle well - but the driver itself should have a clear idea
when tags are end of life.

Yes, it is a very reasonable simplified design for the driver to say
the DID lifecycle ends at every context entry change, but that is very
different from stating the flush as a mandatory requirement from the
HW spec.

A more complex design could have each context entry request a cache
tag for the context entry, for the specific domain, and share the tags
whenever possible, eg for SVA and S2 domains.

Jason
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Baolu Lu 4 weeks, 1 day ago
On 3/9/26 21:41, Jason Gunthorpe wrote:
> On Mon, Mar 09, 2026 at 02:06:42PM +0800, Lu Baolu wrote:
>> +static void intel_pasid_get_used(const u128 *entry, u128 *used)
>> +{
>> +	struct pasid_entry *pe = (struct pasid_entry *)entry;
>> +	struct pasid_entry *ue = (struct pasid_entry *)used;
>> +	u16 pgtt;
>> +
>> +	/* Initialize used bits to 0. */
>> +	memset(ue, 0, sizeof(*ue));
>> +
>> +	/* Present bit always matters. */
>> +	ue->val[0] |= PASID_PTE_PRESENT;
>> +
>> +	/* Nothing more for non-present entries. */
>> +	if (!(pe->val[0] & PASID_PTE_PRESENT))
>> +		return;
>> +
>> +	pgtt = pasid_pte_get_pgtt(pe);
>> +	switch (pgtt) {
>> +	case PASID_ENTRY_PGTT_FL_ONLY:
>> +		/* AW, PGTT */
>> +		ue->val[0] |= GENMASK_ULL(4, 2) | GENMASK_ULL(8, 6);
>> +		/* DID, PWSNP, PGSNP */
>> +		ue->val[1] |= GENMASK_ULL(24, 23) | GENMASK_ULL(15, 0);
>> +		/* FSPTPTR, FSPM */
>> +		ue->val[2] |= GENMASK_ULL(63, 12) | GENMASK_ULL(3, 2);
> This would be an excellent time to properly add these constants 🙁
> 
> /* 9.6 Scalable-Mode PASID Table Entry */
> #define SM_PASID0_P		BIT_U64(0)
> #define SM_PASID0_FPD		BIT_U64(1)
> #define SM_PASID0_AW		GENMASK_U64(4, 2)
> #define SM_PASID0_SSEE		BIT_U64(5)
> #define SM_PASID0_PGTT		GENMASK_U64(8, 6)
> #define SM_PASID0_SSADE		BIT_U64(9)
> #define SM_PASID0_SSPTPTR	GENMASK_U64(63, 12)
> 
> #define SM_PASID1_DID		GENMASK_U64(15, 0)
> #define SM_PASID1_PWSNP		BIT_U64(23)
> #define SM_PASID1_PGSNP		BIT_U64(24)
> #define SM_PASID1_CD		BIT_U64(25)
> #define SM_PASID1_EMTE		BIT_U64(26)
> #define SM_PASID1_PAT		GENMASK_U64(63, 32)
> 
> #define SM_PASID2_SRE		BIT_U64(0)
> #define SM_PASID2_ERE		BIT_U64(1)
> #define SM_PASID2_FSPM		GENMASK_U64(3, 2)
> #define SM_PASID2_WPE		BIT_U64(4)
> #define SM_PASID2_NXE		BIT_U64(5)
> #define SM_PASID2_SMEP		BIT_U64(6)
> #define SM_PASID2_EAFE		BIT_U64(7)
> #define SM_PASID2_FSPTPTR	GENMASK_U64(63, 12)

Yeah, code updated like this,

drivers/iommu/intel/pasid.h:

/* 9.6 Scalable-Mode PASID Table Entry */
#define SM_PASID0_P             BIT_U64(0)
#define SM_PASID0_FPD           BIT_U64(1)
#define SM_PASID0_AW            GENMASK_U64(4, 2)
#define SM_PASID0_PGTT          GENMASK_U64(8, 6)
#define SM_PASID0_SSADE         BIT_U64(9)
#define SM_PASID0_SSPTPTR       GENMASK_U64(63, 12)

#define SM_PASID1_DID           GENMASK_U64(15, 0)
#define SM_PASID1_PWSNP         BIT_U64(23)
#define SM_PASID1_PGSNP         BIT_U64(24)
#define SM_PASID1_CD            BIT_U64(25)
#define SM_PASID1_EMTE          BIT_U64(26)
#define SM_PASID1_PAT           GENMASK_U64(63, 32)

#define SM_PASID2_SRE           BIT_U64(0)
#define SM_PASID2_FSPM          GENMASK_U64(3, 2)
#define SM_PASID2_WPE           BIT_U64(4)
#define SM_PASID2_EAFE          BIT_U64(7)
#define SM_PASID2_FSPTPTR       GENMASK_U64(63, 12)

drivers/iommu/intel/pasid.c:

static void intel_pasid_get_used(const u128 *entry, u128 *used)
{
         struct pasid_entry *pe = (struct pasid_entry *)entry;
         struct pasid_entry *ue = (struct pasid_entry *)used;
         u16 pgtt;

         /* Initialize used bits to 0. */
         memset(ue, 0, sizeof(*ue));

         /* Present bit always matters. */
         ue->val[0] |= SM_PASID0_P;

         /* Nothing more for non-present entries. */
         if (!(pe->val[0] & SM_PASID0_P)) {
                 trace_entry_get_used(entry, used);
                 return;
         }

         pgtt = pasid_pte_get_pgtt(pe);
         switch (pgtt) {
         case PASID_ENTRY_PGTT_FL_ONLY:
                 /* AW, PGTT */
                 ue->val[0] |= SM_PASID0_AW | SM_PASID0_PGTT;
                 /* DID, PWSNP, PGSNP */
                 ue->val[1] |= SM_PASID1_DID | SM_PASID1_PWSNP | 
SM_PASID1_PGSNP;
                 /* FSPTPTR, FSPM */
                 ue->val[2] |= SM_PASID2_FSPTPTR | SM_PASID2_FSPM;
                 break;
         case PASID_ENTRY_PGTT_NESTED:
                 /* FPD, AW, PGTT, SSADE, SSPTPTR*/
                 ue->val[0] |= SM_PASID0_FPD | SM_PASID0_AW | 
SM_PASID0_PGTT |
                                 SM_PASID0_SSADE | SM_PASID0_SSPTPTR;
                 /* PGSNP, DID, PWSNP */
                 ue->val[1] |= SM_PASID1_DID | SM_PASID1_PWSNP | 
SM_PASID1_PGSNP;
                 /* FSPTPTR, FSPM, EAFE, WPE, SRE */
                 ue->val[2] |= SM_PASID2_SRE | SM_PASID2_WPE | 
SM_PASID2_EAFE |
                                 SM_PASID2_FSPM | SM_PASID2_FSPTPTR;
                 break;
         case PASID_ENTRY_PGTT_SL_ONLY:
                 /* FPD, AW, PGTT, SSADE, SSPTPTR */
                 ue->val[0] |= SM_PASID0_FPD | SM_PASID0_AW | 
SM_PASID0_PGTT |
                                 SM_PASID0_SSADE | SM_PASID0_SSPTPTR;
                 /* PGSNP, DID, PWSNP */
                 ue->val[1] |= SM_PASID1_DID | SM_PASID1_PWSNP | 
SM_PASID1_PGSNP;
                 break;
         case PASID_ENTRY_PGTT_PT:
                 /* FPD, AW, PGTT */
                 ue->val[0] |= SM_PASID0_FPD | SM_PASID0_AW | 
SM_PASID0_PGTT;
                 /* PGSNP, DID, PWSNP */
                 ue->val[1] |= SM_PASID1_DID | SM_PASID1_PWSNP | 
SM_PASID1_PGSNP;
                 break;
         default:
                 WARN_ON(true);
         }

         trace_entry_get_used(entry, used);
}

Thanks,
baolu
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Jason Gunthorpe 4 weeks ago
On Wed, Mar 11, 2026 at 04:42:37PM +0800, Baolu Lu wrote:
>         switch (pgtt) {
>         case PASID_ENTRY_PGTT_FL_ONLY:
>                 /* AW, PGTT */
>                 ue->val[0] |= SM_PASID0_AW | SM_PASID0_PGTT;

Probably don't need the comments too :)

Jason
Re: [PATCH 2/8] iommu/vt-d: Add entry_sync support for PASID entry updates
Posted by Baolu Lu 4 weeks ago
On 3/11/26 20:23, Jason Gunthorpe wrote:
> On Wed, Mar 11, 2026 at 04:42:37PM +0800, Baolu Lu wrote:
>>          switch (pgtt) {
>>          case PASID_ENTRY_PGTT_FL_ONLY:
>>                  /* AW, PGTT */
>>                  ue->val[0] |= SM_PASID0_AW | SM_PASID0_PGTT;
> Probably don't need the comments too 🙂

Yeah, the code itself is self-explanatory.

Thanks,
baolu