[PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3

Lu Baolu posted 8 patches 1 month ago
[PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Lu Baolu 1 month ago
From: Jason Gunthorpe <jgg@nvidia.com>

Many IOMMU implementations store data structures in host memory that can
be quite big. The iommu is able to DMA read the host memory using an
atomic quanta, usually 64 or 128 bits, and will read an entry using
multiple quanta reads.

Updating the host memory datastructure entry while the HW is concurrently
DMA'ing it is a little bit involved, but if you want to do this hitlessly,
while never making the entry non-valid, then it becomes quite complicated.

entry_sync is a library to handle this task. It works on the notion of
"used bits" which reflect which bits the HW is actually sensitive to and
which bits are ignored by hardware. Many hardware specifications say
things like 'if mode is X then bits ABC are ignored'.

Using the ignored bits, entry_sync can often compute a series of ordered
writes and flushes that will allow the entry to be updated while keeping
it valid. If such an update is not possible then entry will be made
temporarily non-valid.

A 64 and 128 bit quanta version is provided to support existing iommus.

Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
---
 drivers/iommu/Kconfig               |  14 +++
 drivers/iommu/Makefile              |   1 +
 drivers/iommu/entry_sync.h          |  66 +++++++++++++
 drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
 drivers/iommu/entry_sync.c          |  68 +++++++++++++
 5 files changed, 292 insertions(+)
 create mode 100644 drivers/iommu/entry_sync.h
 create mode 100644 drivers/iommu/entry_sync_template.h
 create mode 100644 drivers/iommu/entry_sync.c

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f86262b11416..2650c9fa125b 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -145,6 +145,20 @@ config IOMMU_DEFAULT_PASSTHROUGH
 
 endchoice
 
+config IOMMU_ENTRY_SYNC
+	bool
+	default n
+
+config IOMMU_ENTRY_SYNC64
+	bool
+	select IOMMU_ENTRY_SYNC
+	default n
+
+config IOMMU_ENTRY_SYNC128
+	bool
+	select IOMMU_ENTRY_SYNC
+	default n
+
 config OF_IOMMU
 	def_bool y
 	depends on OF && IOMMU_API
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 0275821f4ef9..bd923995497a 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -10,6 +10,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
 obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
 obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
 obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
+obj-$(CONFIG_IOMMU_ENTRY_SYNC) += entry_sync.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
 obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
diff --git a/drivers/iommu/entry_sync.h b/drivers/iommu/entry_sync.h
new file mode 100644
index 000000000000..004d421c71c0
--- /dev/null
+++ b/drivers/iommu/entry_sync.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Many IOMMU implementations store data structures in host memory that can be
+ * quite big. The iommu is able to DMA read the host memory using an atomic
+ * quanta, usually 64 or 128 bits, and will read an entry using multiple quanta
+ * reads.
+ *
+ * Updating the host memory datastructure entry while the HW is concurrently
+ * DMA'ing it is a little bit involved, but if you want to do this hitlessly,
+ * while never making the entry non-valid, then it becomes quite complicated.
+ *
+ * entry_sync is a library to handle this task. It works on the notion of "used
+ * bits" which reflect which bits the HW is actually sensitive to and which bits
+ * are ignored by hardware. Many hardware specifications say things like 'if
+ * mode is X then bits ABC are ignored'.
+ *
+ * Using the ignored bits entry_sync can often compute a series of ordered
+ * writes and flushes that will allow the entry to be updated while keeping it
+ * valid. If such an update is not possible then entry will be made temporarily
+ * non-valid.
+ *
+ * A 64 and 128 bit quanta version is provided to support existing iommus.
+ */
+#ifndef IOMMU_ENTRY_SYNC_H
+#define IOMMU_ENTRY_SYNC_H
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <linux/bug.h>
+
+/* Caller allocates an array of this many quanta; its size in bytes is passed to entry_sync_write() */
+#define ENTRY_SYNC_MEMORY_LEN(writer) ((writer)->num_quantas * 3)
+
+struct entry_sync_writer_ops64;
+struct entry_sync_writer64 {
+	const struct entry_sync_writer_ops64 *ops;
+	size_t num_quantas;
+	size_t vbit_quanta;
+};
+
+struct entry_sync_writer_ops64 {
+	void (*get_used)(const __le64 *entry, __le64 *used);
+	void (*sync)(struct entry_sync_writer64 *writer);
+};
+
+void entry_sync_write64(struct entry_sync_writer64 *writer, __le64 *entry,
+			const __le64 *target, __le64 *memory,
+			size_t memory_len);
+
+struct entry_sync_writer_ops128;
+struct entry_sync_writer128 {
+	const struct entry_sync_writer_ops128 *ops;
+	size_t num_quantas;
+	size_t vbit_quanta;
+};
+
+struct entry_sync_writer_ops128 {
+	void (*get_used)(const u128 *entry, u128 *used);
+	void (*sync)(struct entry_sync_writer128 *writer);
+};
+
+void entry_sync_write128(struct entry_sync_writer128 *writer, u128 *entry,
+			 const u128 *target, u128 *memory,
+			 size_t memory_len);
+
+#endif
diff --git a/drivers/iommu/entry_sync_template.h b/drivers/iommu/entry_sync_template.h
new file mode 100644
index 000000000000..646f518b098e
--- /dev/null
+++ b/drivers/iommu/entry_sync_template.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#include "entry_sync.h"
+#include <linux/args.h>
+#include <linux/bitops.h>
+
+#ifndef entry_sync_writer
+#define entry_sync_writer entry_sync_writer64
+#define quanta_t __le64
+#define NS(name) CONCATENATE(name, 64)
+#endif
+
+/*
+ * Figure out if we can do a hitless update of entry to become target. Returns a
+ * bit mask where 1 indicates that a quanta word needs to be set disruptively.
+ * unused_update is an intermediate value of entry that has unused bits set to
+ * their new values.
+ */
+static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
+				const quanta_t *entry, const quanta_t *target,
+				quanta_t *unused_update, quanta_t *memory)
+{
+	quanta_t *target_used = memory + writer->num_quantas * 1;
+	quanta_t *cur_used = memory + writer->num_quantas * 2;
+	u8 used_qword_diff = 0;
+	unsigned int i;
+
+	writer->ops->get_used(entry, cur_used);
+	writer->ops->get_used(target, target_used);
+
+	for (i = 0; i != writer->num_quantas; i++) {
+		/*
+		 * Check that masks are up to date; the functions that build
+		 * target entries must not set a bit to 1 if the get_used()
+		 * callback doesn't report it as used.
+		 */
+		WARN_ON_ONCE(target[i] & ~target_used[i]);
+
+		/* Bits can change because they are not currently being used */
+		unused_update[i] = (entry[i] & cur_used[i]) |
+				   (target[i] & ~cur_used[i]);
+		/*
+		 * Each bit indicates that a used bit in a qword needs to be
+		 * changed after unused_update is applied.
+		 */
+		if ((unused_update[i] & target_used[i]) != target[i])
+			used_qword_diff |= 1 << i;
+	}
+	return used_qword_diff;
+}
+
+/*
+ * Update the entry to the target configuration. The transition from the current
+ * entry to the target entry takes place over multiple steps that attempt to
+ * make the transition hitless if possible. This function takes care not to
+ * create a situation where the HW can perceive a corrupted entry. HW is only
+ * required to have a quanta-bit atomicity with stores from the CPU, while
+ * entries are many quanta bit values big.
+ *
+ * The difference between the current value and the target value is analyzed to
+ * determine which of three updates are required - disruptive, hitless or no
+ * change.
+ *
+ * In the most general disruptive case we can make any update in three steps:
+ *  - Disrupting the entry (V=0)
+ *  - Fill now unused quanta words, except qword 0 which contains V
+ *  - Make qword 0 have the final value and valid (V=1) with a single 64
+ *    bit store
+ *
+ * However this disrupts the HW while it is happening. There are several
+ * interesting cases where a STE/CD can be updated without disturbing the HW
+ * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
+ * because the used bits don't intersect. We can detect this by calculating how
+ * many 64 bit values need update after adjusting the unused bits and skip the
+ * V=0 process. This relies on the IGNORED behavior described in the
+ * specification.
+ */
+void NS(entry_sync_write)(struct entry_sync_writer *writer, quanta_t *entry,
+			  const quanta_t *target, quanta_t *memory,
+			  size_t memory_len)
+{
+	quanta_t *unused_update = memory + writer->num_quantas * 0;
+	u8 used_qword_diff;
+
+	if (WARN_ON(memory_len !=
+		    ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
+		return;
+
+	used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
+						unused_update, memory);
+	if (hweight8(used_qword_diff) == 1) {
+		/*
+		 * Only one quanta needs its used bits to be changed. This is a
+		 * hitless update, update all bits the current entry is ignoring
+		 * to their new values, then update a single "critical quanta"
+		 * to change the entry and finally 0 out any bits that are now
+		 * unused in the target configuration.
+		 */
+		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
+
+		/*
+		 * Skip writing unused bits in the critical quanta since we'll
+		 * be writing it in the next step anyways. This can save a sync
+		 * when the only change is in that quanta.
+		 */
+		unused_update[critical_qword_index] =
+			entry[critical_qword_index];
+		NS(entry_set)(writer, entry, unused_update, 0,
+			      writer->num_quantas);
+		NS(entry_set)(writer, entry, target, critical_qword_index, 1);
+		NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
+	} else if (used_qword_diff) {
+		/*
+		 * At least two quantas need their inuse bits to be changed.
+		 * This requires a breaking update, zero the V bit, write all
+		 * qwords but 0, then set qword 0
+		 */
+		unused_update[writer->vbit_quanta] = 0;
+		NS(entry_set)(writer, entry, unused_update, writer->vbit_quanta, 1);
+
+		if (writer->vbit_quanta != 0)
+			NS(entry_set)(writer, entry, target, 0,
+				      writer->vbit_quanta - 1);
+		if (writer->vbit_quanta != writer->num_quantas)
+			NS(entry_set)(writer, entry, target,
+				      writer->vbit_quanta,
+				      writer->num_quantas - 1);
+
+		NS(entry_set)(writer, entry, target, writer->vbit_quanta, 1);
+	} else {
+		/*
+		 * No inuse bit changed. Sanity check that all unused bits are 0
+		 * in the entry. The target was already sanity checked by
+		 * entry_quanta_diff().
+		 */
+		WARN_ON_ONCE(NS(entry_set)(writer, entry, target, 0,
+					   writer->num_quantas));
+	}
+}
+EXPORT_SYMBOL(NS(entry_sync_write));
+
+#undef entry_sync_writer
+#undef quanta_t
+#undef NS
diff --git a/drivers/iommu/entry_sync.c b/drivers/iommu/entry_sync.c
new file mode 100644
index 000000000000..48d31270dbba
--- /dev/null
+++ b/drivers/iommu/entry_sync.c
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Helpers for drivers to update multi-quanta entries shared with HW without
+ * races to minimize breaking changes.
+ */
+#include "entry_sync.h"
+#include <linux/kconfig.h>
+#include <linux/atomic.h>
+
+#if IS_ENABLED(CONFIG_IOMMU_ENTRY_SYNC64)
+static bool entry_set64(struct entry_sync_writer64 *writer, __le64 *entry,
+			const __le64 *target, unsigned int start,
+			unsigned int len)
+{
+	bool changed = false;
+	unsigned int i;
+
+	for (i = start; len != 0; len--, i++) {
+		if (entry[i] != target[i]) {
+			WRITE_ONCE(entry[i], target[i]);
+			changed = true;
+		}
+	}
+
+	if (changed)
+		writer->ops->sync(writer);
+	return changed;
+}
+
+#define entry_sync_writer entry_sync_writer64
+#define quanta_t __le64
+#define NS(name) CONCATENATE(name, 64)
+#include "entry_sync_template.h"
+#endif
+
+#if IS_ENABLED(CONFIG_IOMMU_ENTRY_SYNC128)
+static bool entry_set128(struct entry_sync_writer128 *writer, u128 *entry,
+			 const u128 *target, unsigned int start,
+			 unsigned int len)
+{
+	bool changed = false;
+	unsigned int i;
+
+	for (i = start; len != 0; len--, i++) {
+		if (entry[i] != target[i]) {
+			/*
+			 * Use cmpxchg128 to generate an indivisible write from
+			 * the CPU to DMA'able memory. This must ensure that HW
+			 * sees either the new or old 128 bit value and not
+			 * something torn. As updates are serialized by a
+			 * spinlock, we use the local (unlocked) variant to
+			 * avoid unnecessary bus locking overhead.
+			 */
+			cmpxchg128_local(&entry[i], entry[i], target[i]);
+			changed = true;
+		}
+	}
+
+	if (changed)
+		writer->ops->sync(writer);
+	return changed;
+}
+
+#define entry_sync_writer entry_sync_writer128
+#define quanta_t u128
+#define NS(name) CONCATENATE(name, 128)
+#include "entry_sync_template.h"
+#endif
-- 
2.43.0
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Nicolin Chen 3 weeks, 6 days ago
Hi Baolu,

On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
> +struct entry_sync_writer_ops64;
> +struct entry_sync_writer64 {
> +	const struct entry_sync_writer_ops64 *ops;
> +	size_t num_quantas;
> +	size_t vbit_quanta;
> +};

Though I could guess what the @num_quantas and @vbit_quanta likely
mean, it'd be nicer to have some notes elaborating them.

> +/*
> + * Figure out if we can do a hitless update of entry to become target. Returns a
> + * bit mask where 1 indicates that a quanta word needs to be set disruptively.
> + * unused_update is an intermediate value of entry that has unused bits set to
> + * their new values.
> + */
> +static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
> +				const quanta_t *entry, const quanta_t *target,
> +				quanta_t *unused_update, quanta_t *memory)
> +{
> +	quanta_t *target_used = memory + writer->num_quantas * 1;
> +	quanta_t *cur_used = memory + writer->num_quantas * 2;

Should we have a kdoc somewhere mentioning that the two arrays are
neighbors (IIUIC)?

> +	u8 used_qword_diff = 0;

It seems to me that we want to use "quanta" vs. "qword"? 128 bits can
be called "dqword" as well though.

> +	unsigned int i;
> +
> +	writer->ops->get_used(entry, cur_used);
> +	writer->ops->get_used(target, target_used);

SMMU has get_update_safe now. Can we take it together?

> +void NS(entry_sync_write)(struct entry_sync_writer *writer, quanta_t *entry,
> +			  const quanta_t *target, quanta_t *memory,
> +			  size_t memory_len)
> +{
> +	quanta_t *unused_update = memory + writer->num_quantas * 0;
> +	u8 used_qword_diff;
> +
> +	if (WARN_ON(memory_len !=
> +		    ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
> +		return;
> +
> +	used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
> +						unused_update, memory);
> +	if (hweight8(used_qword_diff) == 1) {
> +		/*
> +		 * Only one quanta needs its used bits to be changed. This is a
> +		 * hitless update, update all bits the current entry is ignoring
> +		 * to their new values, then update a single "critical quanta"
> +		 * to change the entry and finally 0 out any bits that are now
> +		 * unused in the target configuration.
> +		 */
> +		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
> +
> +		/*
> +		 * Skip writing unused bits in the critical quanta since we'll
> +		 * be writing it in the next step anyways. This can save a sync
> +		 * when the only change is in that quanta.
> +		 */
> +		unused_update[critical_qword_index] =
> +			entry[critical_qword_index];
> +		NS(entry_set)(writer, entry, unused_update, 0,
> +			      writer->num_quantas);
> +		NS(entry_set)(writer, entry, target, critical_qword_index, 1);
> +		NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
> +	} else if (used_qword_diff) {
> +		/*
> +		 * At least two quantas need their inuse bits to be changed.
> +		 * This requires a breaking update, zero the V bit, write all
> +		 * qwords but 0, then set qword 0
> +		 */

Still, it'd be nicer to unify the wording between "quanta" and
"qword".

[..]
> +EXPORT_SYMBOL(NS(entry_sync_write));

There is also a KUNIT test coverage in arm-smmu-v3 for all of these
functions. Maybe we can make that generic as well?

> +#define entry_sync_writer entry_sync_writer64
> +#define quanta_t __le64
[..]
> +#define entry_sync_writer entry_sync_writer128
> +#define quanta_t u128

u64 can be called 64 too, though we might not have use case for now.

But maybe we could just call them:
    entry_sync_writer_le64
    entry_sync_writer_u128
?

Nicolin
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Baolu Lu 3 weeks, 3 days ago
On 3/13/26 13:39, Nicolin Chen wrote:
> Hi Baolu,

Hi Nicolin,

Thanks for the comments.

> 
> On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>> +struct entry_sync_writer_ops64;
>> +struct entry_sync_writer64 {
>> +	const struct entry_sync_writer_ops64 *ops;
>> +	size_t num_quantas;
>> +	size_t vbit_quanta;
>> +};
> 
> Though I could guess what the @num_quantas and @vbit_quanta likely
> mean, it'd be nicer to have some notes elaborating them.

Yes. I will make it like this,

struct entry_sync_writer64 {
         const struct entry_sync_writer_ops64 *ops;
         /* Total size of the entry in atomic units: */
         size_t num_quantas;
         /* The index of the quanta containing the Valid bit: */
         size_t vbit_quanta;
};

The same to entry_sync_writer128.

> 
>> +/*
>> + * Figure out if we can do a hitless update of entry to become target. Returns a
>> + * bit mask where 1 indicates that a quanta word needs to be set disruptively.
>> + * unused_update is an intermediate value of entry that has unused bits set to
>> + * their new values.
>> + */
>> +static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
>> +				const quanta_t *entry, const quanta_t *target,
>> +				quanta_t *unused_update, quanta_t *memory)
>> +{
>> +	quanta_t *target_used = memory + writer->num_quantas * 1;
>> +	quanta_t *cur_used = memory + writer->num_quantas * 2;
> 
> Should we have a kdoc somewhere mentioning that the two arrays are
> neighbors (IIUIC)?

The library uses a single block of scratchpad memory and offsets into 
it. A WARN_ON() is added in NS(entry_sync_write) to ensure this:

         if (WARN_ON(memory_len !=
                     ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
                 return;

How about adding below comments around this WARN_ON()?

/*
  * The scratchpad memory is organized into three neighbors:
  * 1. [0, num_quantas): 'unused_update' - intermediate state with
  *    ignored bits updated.
  * 2. [num_quantas, 2*num_quantas): 'target_used' - bits active in
  *    the target state.
  * 3. [2*num_quantas, 3*num_quantas): 'cur_used' - bits active in
  *    the current state.
  */

>> +	u8 used_qword_diff = 0;
> 
> It seems to me that we want use "quanta" v.s. "qword"? 128 bits can
> be called "dqword" as well though.

Yes. "qword" is a bit too x86-centric. Since the library is designed
around the concept of an atomic "quanta" of update, I will unify the
terminology ("quanta" in general) and use used_quanta_diff

> 
>> +	unsigned int i;
>> +
>> +	writer->ops->get_used(entry, cur_used);
>> +	writer->ops->get_used(target, target_used);
> 
> SMMU has get_update_safe now. Can we take it together?

I will look into the SMMUv3 get_update_safe implementation. Or integrate
that specially when we transition the ARM SMMUv3 driver to use this
generic entry_sync library.

> 
>> +void NS(entry_sync_write)(struct entry_sync_writer *writer, quanta_t *entry,
>> +			  const quanta_t *target, quanta_t *memory,
>> +			  size_t memory_len)
>> +{
>> +	quanta_t *unused_update = memory + writer->num_quantas * 0;
>> +	u8 used_qword_diff;
>> +
>> +	if (WARN_ON(memory_len !=
>> +		    ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
>> +		return;
>> +
>> +	used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
>> +						unused_update, memory);
>> +	if (hweight8(used_qword_diff) == 1) {
>> +		/*
>> +		 * Only one quanta needs its used bits to be changed. This is a
>> +		 * hitless update, update all bits the current entry is ignoring
>> +		 * to their new values, then update a single "critical quanta"
>> +		 * to change the entry and finally 0 out any bits that are now
>> +		 * unused in the target configuration.
>> +		 */
>> +		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
>> +
>> +		/*
>> +		 * Skip writing unused bits in the critical quanta since we'll
>> +		 * be writing it in the next step anyways. This can save a sync
>> +		 * when the only change is in that quanta.
>> +		 */
>> +		unused_update[critical_qword_index] =
>> +			entry[critical_qword_index];
>> +		NS(entry_set)(writer, entry, unused_update, 0,
>> +			      writer->num_quantas);
>> +		NS(entry_set)(writer, entry, target, critical_qword_index, 1);
>> +		NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
>> +	} else if (used_qword_diff) {
>> +		/*
>> +		 * At least two quantas need their inuse bits to be changed.
>> +		 * This requires a breaking update, zero the V bit, write all
>> +		 * qwords but 0, then set qword 0
>> +		 */
> 
> Still, it'd be nicer to unify the wording between "quanta" and
> "qword".

Yes.

> 
> [..]
>> +EXPORT_SYMBOL(NS(entry_sync_write));
> 
> There is also a KUNIT test coverage in arm-smmu-v3 for all of these
> functions. Maybe we can make that generic as well?

Same here.

> 
>> +#define entry_sync_writer entry_sync_writer64
>> +#define quanta_t __le64
> [..]
>> +#define entry_sync_writer entry_sync_writer128
>> +#define quanta_t u128
> 
> u64 can be called 64 too, though we might not have use case for now.
> 
> But maybe we could just call them:
>      entry_sync_writer_le64
>      entry_sync_writer_u128
> ?
I'm fine with the new naming. It is more explicit. I will update the
names unless there are further objections.

Thanks,
baolu
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Jason Gunthorpe 2 weeks, 2 days ago
On Mon, Mar 16, 2026 at 02:24:57PM +0800, Baolu Lu wrote:

> > > +	writer->ops->get_used(entry, cur_used);
> > > +	writer->ops->get_used(target, target_used);
> > 
> > SMMU has get_update_safe now. Can we take it together?
> 
> I will look into the SMMUv3 get_update_safe implementation. Or integrate
> that specially when we transition the ARM SMMUv3 driver to use this
> generic entry_sync library.

The intention was to copy the existing ARM code as is, the draft I
sent was before these changes from Nicolin, so it should get updated..

> > > +EXPORT_SYMBOL(NS(entry_sync_write));
> > 
> > There is also a KUNIT test coverage in arm-smmu-v3 for all of these
> > functions. Maybe we can make that generic as well?
> 
> Same here.

That will be a bit hard since it depends on driver functions.

> > But maybe we could just call them:
> >      entry_sync_writer_le64
> >      entry_sync_writer_u128
> > ?
> I'm fine with the new naming. It is more explicit. I will update the
> names unless there are further objections.

I was wondering if we should just be using void * here as the type
safety seems a bit harmful if the goal is to make the 128 bit option
fall back to 64 bits if not supported.

The maximum supported HW atomic quanta can be passed in through the
struct.

Jason
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Baolu Lu 2 weeks, 2 days ago
On 3/23/26 20:59, Jason Gunthorpe wrote:
> On Mon, Mar 16, 2026 at 02:24:57PM +0800, Baolu Lu wrote:
> 
>>>> +	writer->ops->get_used(entry, cur_used);
>>>> +	writer->ops->get_used(target, target_used);
>>>
>>> SMMU has get_update_safe now. Can we take it together?
>>
>> I will look into the SMMUv3 get_update_safe implementation. Or integrate
>> that specially when we transition the ARM SMMUv3 driver to use this
>> generic entry_sync library.
> 
> The intention was to copy the existing ARM code as is, the draft I
> sent was before these changes from Nicolin, so it should get updated..

Okay.

> 
>>>> +EXPORT_SYMBOL(NS(entry_sync_write));
>>>
>>> There is also a KUNIT test coverage in arm-smmu-v3 for all of these
>>> functions. Maybe we can make that generic as well?
>>
>> Same here.
> 
> That will be a bit hard since it depends on driver functions.
> 
>>> But maybe we could just call them:
>>>       entry_sync_writer_le64
>>>       entry_sync_writer_u128
>>> ?
>> I'm fine with the new naming. It is more explicit. I will update the
>> names unless there are further objections.
> 
> I was wondering if we should just be using void * here as the type
> safety seems a bit harmful if the goal is to make the 128 bit option
> fall back to 64 bits if not supported.
> 
> The maximum supported HW atomic quanta can be passed in through the
> struct.

I will explore refactoring the library to use void * and a dynamic
quanta size for v2.

Thanks,
baolu
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Samiullah Khawaja 1 month ago
On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>From: Jason Gunthorpe <jgg@nvidia.com>
>
>Many IOMMU implementations store data structures in host memory that can
>be quite big. The iommu is able to DMA read the host memory using an
>atomic quanta, usually 64 or 128 bits, and will read an entry using
>multiple quanta reads.
>
>Updating the host memory datastructure entry while the HW is concurrently
>DMA'ing it is a little bit involved, but if you want to do this hitlessly,
>while never making the entry non-valid, then it becomes quite complicated.
>
>entry_sync is a library to handle this task. It works on the notion of
>"used bits" which reflect which bits the HW is actually sensitive to and
>which bits are ignored by hardware. Many hardware specifications say
>things like 'if mode is X then bits ABC are ignored'.
>
>Using the ignored bits entry_sync can often compute a series of ordered
>writes and flushes that will allow the entry to be updated while keeping
>it valid. If such an update is not possible then entry will be made
>temporarily non-valid.
>
>A 64 and 128 bit quanta version is provided to support existing iommus.
>
>Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
>Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>---
> drivers/iommu/Kconfig               |  14 +++
> drivers/iommu/Makefile              |   1 +
> drivers/iommu/entry_sync.h          |  66 +++++++++++++
> drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
> drivers/iommu/entry_sync.c          |  68 +++++++++++++
> 5 files changed, 292 insertions(+)
> create mode 100644 drivers/iommu/entry_sync.h
> create mode 100644 drivers/iommu/entry_sync_template.h
> create mode 100644 drivers/iommu/entry_sync.c
>
>diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>index f86262b11416..2650c9fa125b 100644
>--- a/drivers/iommu/Kconfig
>+++ b/drivers/iommu/Kconfig
>@@ -145,6 +145,20 @@ config IOMMU_DEFAULT_PASSTHROUGH
>
> endchoice
>
>+config IOMMU_ENTRY_SYNC
>+	bool
>+	default n
>+
>+config IOMMU_ENTRY_SYNC64
>+	bool
>+	select IOMMU_ENTRY_SYNC
>+	default n
>+
>+config IOMMU_ENTRY_SYNC128
>+	bool
>+	select IOMMU_ENTRY_SYNC
>+	default n
>+
> config OF_IOMMU
> 	def_bool y
> 	depends on OF && IOMMU_API
>diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>index 0275821f4ef9..bd923995497a 100644
>--- a/drivers/iommu/Makefile
>+++ b/drivers/iommu/Makefile
>@@ -10,6 +10,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
> obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
> obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
> obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>+obj-$(CONFIG_IOMMU_ENTRY_SYNC) += entry_sync.o
> obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
> obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
> obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
>diff --git a/drivers/iommu/entry_sync.h b/drivers/iommu/entry_sync.h
>new file mode 100644
>index 000000000000..004d421c71c0
>--- /dev/null
>+++ b/drivers/iommu/entry_sync.h
>@@ -0,0 +1,66 @@
>+/* SPDX-License-Identifier: GPL-2.0-only */
>+/*
>+ * Many IOMMU implementations store data structures in host memory that can be
>+ * quite big. The iommu is able to DMA read the host memory using an atomic
>+ * quanta, usually 64 or 128 bits, and will read an entry using multiple quanta
>+ * reads.
>+ *
>+ * Updating the host memory datastructure entry while the HW is concurrently
>+ * DMA'ing it is a little bit involved, but if you want to do this hitlessly,
>+ * while never making the entry non-valid, then it becomes quite complicated.
>+ *
>+ * entry_sync is a library to handle this task. It works on the notion of "used
>+ * bits" which reflect which bits the HW is actually sensitive to and which bits
>+ * are ignored by hardware. Many hardware specifications say things like 'if
>+ * mode is X then bits ABC are ignored'.
>+ *
>+ * Using the ignored bits entry_sync can often compute a series of ordered
>+ * writes and flushes that will allow the entry to be updated while keeping it
>+ * valid. If such an update is not possible then entry will be made temporarily
>+ * non-valid.
>+ *
>+ * A 64 and 128 bit quanta version is provided to support existing iommus.
>+ */
>+#ifndef IOMMU_ENTRY_SYNC_H
>+#define IOMMU_ENTRY_SYNC_H
>+
>+#include <linux/types.h>
>+#include <linux/compiler.h>
>+#include <linux/bug.h>
>+
>+/* Caller allocates a stack array of this length to call entry_sync_write() */
>+#define ENTRY_SYNC_MEMORY_LEN(writer) ((writer)->num_quantas * 3)
>+
>+struct entry_sync_writer_ops64;
>+struct entry_sync_writer64 {
>+	const struct entry_sync_writer_ops64 *ops;
>+	size_t num_quantas;
>+	size_t vbit_quanta;
>+};
>+
>+struct entry_sync_writer_ops64 {
>+	void (*get_used)(const __le64 *entry, __le64 *used);
>+	void (*sync)(struct entry_sync_writer64 *writer);
>+};
>+
>+void entry_sync_write64(struct entry_sync_writer64 *writer, __le64 *entry,
>+			const __le64 *target, __le64 *memory,
>+			size_t memory_len);
>+
>+struct entry_sync_writer_ops128;
>+struct entry_sync_writer128 {
>+	const struct entry_sync_writer_ops128 *ops;
>+	size_t num_quantas;
>+	size_t vbit_quanta;
>+};
>+
>+struct entry_sync_writer_ops128 {
>+	void (*get_used)(const u128 *entry, u128 *used);
>+	void (*sync)(struct entry_sync_writer128 *writer);
>+};
>+
>+void entry_sync_write128(struct entry_sync_writer128 *writer, u128 *entry,
>+			 const u128 *target, u128 *memory,
>+			 size_t memory_len);
>+
>+#endif
>diff --git a/drivers/iommu/entry_sync_template.h b/drivers/iommu/entry_sync_template.h
>new file mode 100644
>index 000000000000..646f518b098e
>--- /dev/null
>+++ b/drivers/iommu/entry_sync_template.h
>@@ -0,0 +1,143 @@
>+/* SPDX-License-Identifier: GPL-2.0-only */
>+#include "entry_sync.h"
>+#include <linux/args.h>
>+#include <linux/bitops.h>
>+
>+#ifndef entry_sync_writer
>+#define entry_sync_writer entry_sync_writer64
>+#define quanta_t __le64
>+#define NS(name) CONCATENATE(name, 64)
>+#endif
>+
>+/*
>+ * Figure out if we can do a hitless update of entry to become target. Returns a
>+ * bit mask where 1 indicates that a quanta word needs to be set disruptively.
>+ * unused_update is an intermediate value of entry that has unused bits set to
>+ * their new values.
>+ */
>+static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
>+				const quanta_t *entry, const quanta_t *target,
>+				quanta_t *unused_update, quanta_t *memory)
>+{
>+	quanta_t *target_used = memory + writer->num_quantas * 1;
>+	quanta_t *cur_used = memory + writer->num_quantas * 2;
>+	u8 used_qword_diff = 0;
>+	unsigned int i;
>+
>+	writer->ops->get_used(entry, cur_used);
>+	writer->ops->get_used(target, target_used);
>+
>+	for (i = 0; i != writer->num_quantas; i++) {
>+		/*
>+		 * Check that masks are up to date, the make functions are not

nit: "the make functions" looks like a typo.
>+		 * allowed to set a bit to 1 if the used function doesn't say it
>+		 * is used.
>+		 */
>+		WARN_ON_ONCE(target[i] & ~target_used[i]);
>+
>+		/* Bits can change because they are not currently being used */
>+		unused_update[i] = (entry[i] & cur_used[i]) |
>+				   (target[i] & ~cur_used[i]);
>+		/*
>+		 * Each bit indicates that a used bit in a qword needs to be
>+		 * changed after unused_update is applied.
>+		 */
>+		if ((unused_update[i] & target_used[i]) != target[i])
>+			used_qword_diff |= 1 << i;
>+	}
>+	return used_qword_diff;
>+}
>+
>+/*
>+ * Update the entry to the target configuration. The transition from the current
>+ * entry to the target entry takes place over multiple steps that attempts to
>+ * make the transition hitless if possible. This function takes care not to
>+ * create a situation where the HW can perceive a corrupted entry. HW is only
>+ * required to have a quanta-bit atomicity with stores from the CPU, while
>+ * entries are many quanta bit values big.
>+ *
>+ * The difference between the current value and the target value is analyzed to
>+ * determine which of three updates are required - disruptive, hitless or no
>+ * change.
>+ *
>+ * In the most general disruptive case we can make any update in three steps:
>+ *  - Disrupting the entry (V=0)
>+ *  - Fill now unused quanta words, except qword 0 which contains V
>+ *  - Make qword 0 have the final value and valid (V=1) with a single 64
>+ *    bit store
>+ *
>+ * However this disrupts the HW while it is happening. There are several
>+ * interesting cases where a STE/CD can be updated without disturbing the HW
>+ * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
>+ * because the used bits don't intersect. We can detect this by calculating how
>+ * many 64 bit values need update after adjusting the unused bits and skip the
>+ * V=0 process. This relies on the IGNORED behavior described in the
>+ * specification.
>+ */
>+void NS(entry_sync_write)(struct entry_sync_writer *writer, quanta_t *entry,
>+			  const quanta_t *target, quanta_t *memory,
>+			  size_t memory_len)
>+{
>+	quanta_t *unused_update = memory + writer->num_quantas * 0;
>+	u8 used_qword_diff;
>+
>+	if (WARN_ON(memory_len !=
>+		    ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
>+		return;
>+
>+	used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
>+						unused_update, memory);
>+	if (hweight8(used_qword_diff) == 1) {
>+		/*
>+		 * Only one quanta needs its used bits to be changed. This is a
>+		 * hitless update, update all bits the current entry is ignoring
>+		 * to their new values, then update a single "critical quanta"
>+		 * to change the entry and finally 0 out any bits that are now
>+		 * unused in the target configuration.
>+		 */
>+		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
>+
>+		/*
>+		 * Skip writing unused bits in the critical quanta since we'll
>+		 * be writing it in the next step anyways. This can save a sync
>+		 * when the only change is in that quanta.
>+		 */
>+		unused_update[critical_qword_index] =
>+			entry[critical_qword_index];
>+		NS(entry_set)(writer, entry, unused_update, 0,
>+			      writer->num_quantas);
>+		NS(entry_set)(writer, entry, target, critical_qword_index, 1);
>+		NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
>+	} else if (used_qword_diff) {
>+		/*
>+		 * At least two quantas need their inuse bits to be changed.
>+		 * This requires a breaking update, zero the V bit, write all
>+		 * qwords but 0, then set qword 0
>+		 */
>+		unused_update[writer->vbit_quanta] = 0;
>+		NS(entry_set)(writer, entry, unused_update, writer->vbit_quanta, 1);
>+
>+		if (writer->vbit_quanta != 0)
>+			NS(entry_set)(writer, entry, target, 0,
>+				      writer->vbit_quanta - 1);

Looking at the definition of the entry_set below, the last argument is
a length. So if vbit_quanta is 1 then it would write zero length. Shouldn't
it be writing the quantas before vbit_quanta?
>+		if (writer->vbit_quanta != writer->num_quantas)
>+			NS(entry_set)(writer, entry, target,
>+				      writer->vbit_quanta,
>+				      writer->num_quantas - 1);

Same here, the last argument should not have "- 1".
>+
>+		NS(entry_set)(writer, entry, target, writer->vbit_quanta, 1);
>+	} else {
>+		/*
>+		 * No inuse bit changed. Sanity check that all unused bits are 0
>+		 * in the entry. The target was already sanity checked by
>+		 * entry_quanta_diff().
>+		 */
>+		WARN_ON_ONCE(NS(entry_set)(writer, entry, target, 0,
>+					   writer->num_quantas));
>+	}
>+}
>+EXPORT_SYMBOL(NS(entry_sync_write));
>+
>+#undef entry_sync_writer
>+#undef quanta_t
>+#undef NS
>diff --git a/drivers/iommu/entry_sync.c b/drivers/iommu/entry_sync.c
>new file mode 100644
>index 000000000000..48d31270dbba
>--- /dev/null
>+++ b/drivers/iommu/entry_sync.c
>@@ -0,0 +1,68 @@
>+// SPDX-License-Identifier: GPL-2.0-only
>+/*
>+ * Helpers for drivers to update multi-quanta entries shared with HW without
>+ * races to minimize breaking changes.
>+ */
>+#include "entry_sync.h"
>+#include <linux/kconfig.h>
>+#include <linux/atomic.h>
>+
>+#if IS_ENABLED(CONFIG_IOMMU_ENTRY_SYNC64)
>+static bool entry_set64(struct entry_sync_writer64 *writer, __le64 *entry,
>+			const __le64 *target, unsigned int start,
>+			unsigned int len)
>+{
>+	bool changed = false;
>+	unsigned int i;
>+
>+	for (i = start; len != 0; len--, i++) {
>+		if (entry[i] != target[i]) {
>+			WRITE_ONCE(entry[i], target[i]);
>+			changed = true;
>+		}
>+	}
>+
>+	if (changed)
>+		writer->ops->sync(writer);
>+	return changed;
>+}
>+
>+#define entry_sync_writer entry_sync_writer64
>+#define quanta_t __le64
>+#define NS(name) CONCATENATE(name, 64)
>+#include "entry_sync_template.h"
>+#endif
>+
>+#if IS_ENABLED(CONFIG_IOMMU_ENTRY_SYNC128)
>+static bool entry_set128(struct entry_sync_writer128 *writer, u128 *entry,
>+			 const u128 *target, unsigned int start,
>+			 unsigned int len)
>+{
>+	bool changed = false;
>+	unsigned int i;
>+
>+	for (i = start; len != 0; len--, i++) {
>+		if (entry[i] != target[i]) {
>+			/*
>+			 * Use cmpxchg128 to generate an indivisible write from
>+			 * the CPU to DMA'able memory. This must ensure that HW
>+			 * sees either the new or old 128 bit value and not
>+			 * something torn. As updates are serialized by a
>+			 * spinlock, we use the local (unlocked) variant to
>+			 * avoid unnecessary bus locking overhead.
>+			 */
>+			cmpxchg128_local(&entry[i], entry[i], target[i]);
>+			changed = true;
>+		}
>+	}
>+
>+	if (changed)
>+		writer->ops->sync(writer);
>+	return changed;
>+}
>+
>+#define entry_sync_writer entry_sync_writer128
>+#define quanta_t u128
>+#define NS(name) CONCATENATE(name, 128)
>+#include "entry_sync_template.h"
>+#endif
>-- 
>2.43.0
>
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Jason Gunthorpe 1 week, 2 days ago
On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
> > +	for (i = 0; i != writer->num_quantas; i++) {
> > +		/*
> > +		 * Check that masks are up to date, the make functions are not
> 
> nit: "the make functions" looks like a typo.

the smmu drivers called all the functions that build STE and CD
structs 'arm_smmu_make_xxx' So they are the 'make functions'

> > +	} else if (used_qword_diff) {
> > +		/*
> > +		 * At least two quantas need their inuse bits to be changed.
> > +		 * This requires a breaking update, zero the V bit, write all
> > +		 * qwords but 0, then set qword 0
> > +		 */
> > +		unused_update[writer->vbit_quanta] = 0;
> > +		NS(entry_set)(writer, entry, unused_update, writer->vbit_quanta, 1);
> > +
> > +		if (writer->vbit_quanta != 0)
> > +			NS(entry_set)(writer, entry, target, 0,
> > +				      writer->vbit_quanta - 1);
> 
> Looking at the definition of the entry_set below, the last argument is
> length. So if vbit_quanta 1 then it would write zero len. Shouldn't it
> be writing quantas before the vbit_quanta?
> > +		if (writer->vbit_quanta != writer->num_quantas)
> > +			NS(entry_set)(writer, entry, target,
> > +				      writer->vbit_quanta,
> > +				      writer->num_quantas - 1);
> 
> Sami here, the last argument should not have "- 1".

Yeah, I probably botched this when I quickly drafted it

Jason
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Samiullah Khawaja 1 week, 2 days ago
On Mon, Mar 30, 2026 at 10:00:24AM -0300, Jason Gunthorpe wrote:
>On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
>> > +	for (i = 0; i != writer->num_quantas; i++) {
>> > +		/*
>> > +		 * Check that masks are up to date, the make functions are not
>>
>> nit: "the make functions" looks like a typo.
>
>the smmu drivers called all the functions that build STE and CD
>structs 'arm_smmu_make_xxx' So they are the 'make functions'

Interesting... Thanks for the context.
>
>> > +	} else if (used_qword_diff) {
>> > +		/*
>> > +		 * At least two quantas need their inuse bits to be changed.
>> > +		 * This requires a breaking update, zero the V bit, write all
>> > +		 * qwords but 0, then set qword 0
>> > +		 */
>> > +		unused_update[writer->vbit_quanta] = 0;
>> > +		NS(entry_set)(writer, entry, unused_update, writer->vbit_quanta, 1);
>> > +
>> > +		if (writer->vbit_quanta != 0)
>> > +			NS(entry_set)(writer, entry, target, 0,
>> > +				      writer->vbit_quanta - 1);
>>
>> Looking at the definition of the entry_set below, the last argument is
>> length. So if vbit_quanta 1 then it would write zero len. Shouldn't it
>> be writing quantas before the vbit_quanta?
>> > +		if (writer->vbit_quanta != writer->num_quantas)
>> > +			NS(entry_set)(writer, entry, target,
>> > +				      writer->vbit_quanta,
>> > +				      writer->num_quantas - 1);
>>
>> Sami here, the last argument should not have "- 1".
>
>Yeah, I probably botched this when I quickly drafted it
>
>Jason

Thanks,
Sami
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Samiullah Khawaja 1 month ago
On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
>On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>>From: Jason Gunthorpe <jgg@nvidia.com>
>>
>>Many IOMMU implementations store data structures in host memory that can
>>be quite big. The iommu is able to DMA read the host memory using an
>>atomic quanta, usually 64 or 128 bits, and will read an entry using
>>multiple quanta reads.
>>
>>Updating the host memory datastructure entry while the HW is concurrently
>>DMA'ing it is a little bit involved, but if you want to do this hitlessly,
>>while never making the entry non-valid, then it becomes quite complicated.
>>
>>entry_sync is a library to handle this task. It works on the notion of
>>"used bits" which reflect which bits the HW is actually sensitive to and
>>which bits are ignored by hardware. Many hardware specifications say
>>things like 'if mode is X then bits ABC are ignored'.
>>
>>Using the ignored bits entry_sync can often compute a series of ordered
>>writes and flushes that will allow the entry to be updated while keeping
>>it valid. If such an update is not possible then entry will be made
>>temporarily non-valid.
>>
>>A 64 and 128 bit quanta version is provided to support existing iommus.
>>
>>Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
>>Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>---
>>drivers/iommu/Kconfig               |  14 +++
>>drivers/iommu/Makefile              |   1 +
>>drivers/iommu/entry_sync.h          |  66 +++++++++++++
>>drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
>>drivers/iommu/entry_sync.c          |  68 +++++++++++++
>>5 files changed, 292 insertions(+)
>>create mode 100644 drivers/iommu/entry_sync.h
>>create mode 100644 drivers/iommu/entry_sync_template.h
>>create mode 100644 drivers/iommu/entry_sync.c
>>
>>diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>index f86262b11416..2650c9fa125b 100644
>>--- a/drivers/iommu/Kconfig
>>+++ b/drivers/iommu/Kconfig
>>@@ -145,6 +145,20 @@ config IOMMU_DEFAULT_PASSTHROUGH
>>
>>endchoice
>>
>>+config IOMMU_ENTRY_SYNC
>>+	bool
>>+	default n
>>+
>>+config IOMMU_ENTRY_SYNC64
>>+	bool
>>+	select IOMMU_ENTRY_SYNC
>>+	default n
>>+
>>+config IOMMU_ENTRY_SYNC128
>>+	bool
>>+	select IOMMU_ENTRY_SYNC
>>+	default n
>>+
>>config OF_IOMMU
>>	def_bool y
>>	depends on OF && IOMMU_API
>>diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>>index 0275821f4ef9..bd923995497a 100644
>>--- a/drivers/iommu/Makefile
>>+++ b/drivers/iommu/Makefile
>>@@ -10,6 +10,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
>>obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
>>obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
>>obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>>+obj-$(CONFIG_IOMMU_ENTRY_SYNC) += entry_sync.o
>>obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
>>obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
>>obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
>>diff --git a/drivers/iommu/entry_sync.h b/drivers/iommu/entry_sync.h
>>new file mode 100644
>>index 000000000000..004d421c71c0
>>--- /dev/null
>>+++ b/drivers/iommu/entry_sync.h
>>@@ -0,0 +1,66 @@
>>+/* SPDX-License-Identifier: GPL-2.0-only */
>>+/*
>>+ * Many IOMMU implementations store data structures in host memory that can be
>>+ * quite big. The iommu is able to DMA read the host memory using an atomic
>>+ * quanta, usually 64 or 128 bits, and will read an entry using multiple quanta
>>+ * reads.
>>+ *
>>+ * Updating the host memory datastructure entry while the HW is concurrently
>>+ * DMA'ing it is a little bit involved, but if you want to do this hitlessly,
>>+ * while never making the entry non-valid, then it becomes quite complicated.
>>+ *
>>+ * entry_sync is a library to handle this task. It works on the notion of "used
>>+ * bits" which reflect which bits the HW is actually sensitive to and which bits
>>+ * are ignored by hardware. Many hardware specifications say things like 'if
>>+ * mode is X then bits ABC are ignored'.
>>+ *
>>+ * Using the ignored bits entry_sync can often compute a series of ordered
>>+ * writes and flushes that will allow the entry to be updated while keeping it
>>+ * valid. If such an update is not possible then entry will be made temporarily
>>+ * non-valid.
>>+ *
>>+ * A 64 and 128 bit quanta version is provided to support existing iommus.
>>+ */
>>+#ifndef IOMMU_ENTRY_SYNC_H
>>+#define IOMMU_ENTRY_SYNC_H
>>+
>>+#include <linux/types.h>
>>+#include <linux/compiler.h>
>>+#include <linux/bug.h>
>>+
>>+/* Caller allocates a stack array of this length to call entry_sync_write() */
>>+#define ENTRY_SYNC_MEMORY_LEN(writer) ((writer)->num_quantas * 3)
>>+
>>+struct entry_sync_writer_ops64;
>>+struct entry_sync_writer64 {
>>+	const struct entry_sync_writer_ops64 *ops;
>>+	size_t num_quantas;
>>+	size_t vbit_quanta;
>>+};
>>+
>>+struct entry_sync_writer_ops64 {
>>+	void (*get_used)(const __le64 *entry, __le64 *used);
>>+	void (*sync)(struct entry_sync_writer64 *writer);
>>+};
>>+
>>+void entry_sync_write64(struct entry_sync_writer64 *writer, __le64 *entry,
>>+			const __le64 *target, __le64 *memory,
>>+			size_t memory_len);
>>+
>>+struct entry_sync_writer_ops128;
>>+struct entry_sync_writer128 {
>>+	const struct entry_sync_writer_ops128 *ops;
>>+	size_t num_quantas;
>>+	size_t vbit_quanta;
>>+};
>>+
>>+struct entry_sync_writer_ops128 {
>>+	void (*get_used)(const u128 *entry, u128 *used);
>>+	void (*sync)(struct entry_sync_writer128 *writer);
>>+};
>>+
>>+void entry_sync_write128(struct entry_sync_writer128 *writer, u128 *entry,
>>+			 const u128 *target, u128 *memory,
>>+			 size_t memory_len);
>>+
>>+#endif
>>diff --git a/drivers/iommu/entry_sync_template.h b/drivers/iommu/entry_sync_template.h
>>new file mode 100644
>>index 000000000000..646f518b098e
>>--- /dev/null
>>+++ b/drivers/iommu/entry_sync_template.h
>>@@ -0,0 +1,143 @@
>>+/* SPDX-License-Identifier: GPL-2.0-only */
>>+#include "entry_sync.h"
>>+#include <linux/args.h>
>>+#include <linux/bitops.h>
>>+
>>+#ifndef entry_sync_writer
>>+#define entry_sync_writer entry_sync_writer64
>>+#define quanta_t __le64
>>+#define NS(name) CONCATENATE(name, 64)
>>+#endif
>>+
>>+/*
>>+ * Figure out if we can do a hitless update of entry to become target. Returns a
>>+ * bit mask where 1 indicates that a quanta word needs to be set disruptively.
>>+ * unused_update is an intermediate value of entry that has unused bits set to
>>+ * their new values.
>>+ */
>>+static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
>>+				const quanta_t *entry, const quanta_t *target,
>>+				quanta_t *unused_update, quanta_t *memory)
>>+{
>>+	quanta_t *target_used = memory + writer->num_quantas * 1;
>>+	quanta_t *cur_used = memory + writer->num_quantas * 2;
>>+	u8 used_qword_diff = 0;
>>+	unsigned int i;
>>+
>>+	writer->ops->get_used(entry, cur_used);
>>+	writer->ops->get_used(target, target_used);
>>+
>>+	for (i = 0; i != writer->num_quantas; i++) {
>>+		/*
>>+		 * Check that masks are up to date, the make functions are not
>
>nit: "the make functions" looks like a typo.
>>+		 * allowed to set a bit to 1 if the used function doesn't say it
>>+		 * is used.
>>+		 */
>>+		WARN_ON_ONCE(target[i] & ~target_used[i]);
>>+
>>+		/* Bits can change because they are not currently being used */
>>+		unused_update[i] = (entry[i] & cur_used[i]) |
>>+				   (target[i] & ~cur_used[i]);
>>+		/*
>>+		 * Each bit indicates that a used bit in a qword needs to be
>>+		 * changed after unused_update is applied.
>>+		 */
>>+		if ((unused_update[i] & target_used[i]) != target[i])
>>+			used_qword_diff |= 1 << i;
>>+	}
>>+	return used_qword_diff;
>>+}
>>+
>>+/*
>>+ * Update the entry to the target configuration. The transition from the current
>>+ * entry to the target entry takes place over multiple steps that attempts to
>>+ * make the transition hitless if possible. This function takes care not to
>>+ * create a situation where the HW can perceive a corrupted entry. HW is only
>>+ * required to have a quanta-bit atomicity with stores from the CPU, while
>>+ * entries are many quanta bit values big.
>>+ *
>>+ * The difference between the current value and the target value is analyzed to
>>+ * determine which of three updates are required - disruptive, hitless or no
>>+ * change.
>>+ *
>>+ * In the most general disruptive case we can make any update in three steps:
>>+ *  - Disrupting the entry (V=0)
>>+ *  - Fill now unused quanta words, except qword 0 which contains V
>>+ *  - Make qword 0 have the final value and valid (V=1) with a single 64
>>+ *    bit store
>>+ *
>>+ * However this disrupts the HW while it is happening. There are several
>>+ * interesting cases where a STE/CD can be updated without disturbing the HW
>>+ * because only a small number of bits are changing (S1DSS, CONFIG, etc) or
>>+ * because the used bits don't intersect. We can detect this by calculating how
>>+ * many 64 bit values need update after adjusting the unused bits and skip the
>>+ * V=0 process. This relies on the IGNORED behavior described in the
>>+ * specification.
>>+ */
>>+void NS(entry_sync_write)(struct entry_sync_writer *writer, quanta_t *entry,
>>+			  const quanta_t *target, quanta_t *memory,
>>+			  size_t memory_len)
>>+{
>>+	quanta_t *unused_update = memory + writer->num_quantas * 0;
>>+	u8 used_qword_diff;
>>+
>>+	if (WARN_ON(memory_len !=
>>+		    ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
>>+		return;
>>+
>>+	used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
>>+						unused_update, memory);
>>+	if (hweight8(used_qword_diff) == 1) {
>>+		/*
>>+		 * Only one quanta needs its used bits to be changed. This is a
>>+		 * hitless update, update all bits the current entry is ignoring
>>+		 * to their new values, then update a single "critical quanta"
>>+		 * to change the entry and finally 0 out any bits that are now
>>+		 * unused in the target configuration.
>>+		 */
>>+		unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
>>+
>>+		/*
>>+		 * Skip writing unused bits in the critical quanta since we'll
>>+		 * be writing it in the next step anyways. This can save a sync
>>+		 * when the only change is in that quanta.
>>+		 */
>>+		unused_update[critical_qword_index] =
>>+			entry[critical_qword_index];
>>+		NS(entry_set)(writer, entry, unused_update, 0,
>>+			      writer->num_quantas);
>>+		NS(entry_set)(writer, entry, target, critical_qword_index, 1);
>>+		NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
>>+	} else if (used_qword_diff) {
>>+		/*
>>+		 * At least two quantas need their inuse bits to be changed.
>>+		 * This requires a breaking update, zero the V bit, write all
>>+		 * qwords but 0, then set qword 0
>>+		 */
>>+		unused_update[writer->vbit_quanta] = 0;
>>+		NS(entry_set)(writer, entry, unused_update, writer->vbit_quanta, 1);
>>+
>>+		if (writer->vbit_quanta != 0)
>>+			NS(entry_set)(writer, entry, target, 0,
>>+				      writer->vbit_quanta - 1);
>
>Looking at the definition of the entry_set below, the last argument is
>length. So if vbit_quanta 1 then it would write zero len. Shouldn't it
>be writing quantas before the vbit_quanta?
>>+		if (writer->vbit_quanta != writer->num_quantas)

Looking at this again, I think vbit_quanta can never be equal to
num_quantas, as num_quantas is a length and vbit_quanta is an index?
>>+			NS(entry_set)(writer, entry, target,
>>+				      writer->vbit_quanta,

Starting from vbit_quanta will set the present bit if it is set in the
target?
>>+				      writer->num_quantas - 1);
>
>Sami here, the last argument should not have "- 1".

I meant "Same here".
>>+
>>+		NS(entry_set)(writer, entry, target, writer->vbit_quanta, 1);
>>+	} else {
>>+		/*
>>+		 * No inuse bit changed. Sanity check that all unused bits are 0
>>+		 * in the entry. The target was already sanity checked by
>>+		 * entry_quanta_diff().
>>+		 */
>>+		WARN_ON_ONCE(NS(entry_set)(writer, entry, target, 0,
>>+					   writer->num_quantas));
>>+	}
>>+}
>>+EXPORT_SYMBOL(NS(entry_sync_write));
>>+
>>+#undef entry_sync_writer
>>+#undef quanta_t
>>+#undef NS
>>diff --git a/drivers/iommu/entry_sync.c b/drivers/iommu/entry_sync.c
>>new file mode 100644
>>index 000000000000..48d31270dbba
>>--- /dev/null
>>+++ b/drivers/iommu/entry_sync.c
>>@@ -0,0 +1,68 @@
>>+// SPDX-License-Identifier: GPL-2.0-only
>>+/*
>>+ * Helpers for drivers to update multi-quanta entries shared with HW without
>>+ * races to minimize breaking changes.
>>+ */
>>+#include "entry_sync.h"
>>+#include <linux/kconfig.h>
>>+#include <linux/atomic.h>
>>+
>>+#if IS_ENABLED(CONFIG_IOMMU_ENTRY_SYNC64)
>>+static bool entry_set64(struct entry_sync_writer64 *writer, __le64 *entry,
>>+			const __le64 *target, unsigned int start,
>>+			unsigned int len)
>>+{
>>+	bool changed = false;
>>+	unsigned int i;
>>+
>>+	for (i = start; len != 0; len--, i++) {
>>+		if (entry[i] != target[i]) {
>>+			WRITE_ONCE(entry[i], target[i]);
>>+			changed = true;
>>+		}
>>+	}
>>+
>>+	if (changed)
>>+		writer->ops->sync(writer);
>>+	return changed;
>>+}
>>+
>>+#define entry_sync_writer entry_sync_writer64
>>+#define quanta_t __le64
>>+#define NS(name) CONCATENATE(name, 64)
>>+#include "entry_sync_template.h"
>>+#endif
>>+
>>+#if IS_ENABLED(CONFIG_IOMMU_ENTRY_SYNC128)
>>+static bool entry_set128(struct entry_sync_writer128 *writer, u128 *entry,
>>+			 const u128 *target, unsigned int start,
>>+			 unsigned int len)
>>+{
>>+	bool changed = false;
>>+	unsigned int i;
>>+
>>+	for (i = start; len != 0; len--, i++) {
>>+		if (entry[i] != target[i]) {
>>+			/*
>>+			 * Use cmpxchg128 to generate an indivisible write from
>>+			 * the CPU to DMA'able memory. This must ensure that HW
>>+			 * sees either the new or old 128 bit value and not
>>+			 * something torn. As updates are serialized by a
>>+			 * spinlock, we use the local (unlocked) variant to
>>+			 * avoid unnecessary bus locking overhead.
>>+			 */
>>+			cmpxchg128_local(&entry[i], entry[i], target[i]);
>>+			changed = true;
>>+		}
>>+	}
>>+
>>+	if (changed)
>>+		writer->ops->sync(writer);
>>+	return changed;
>>+}
>>+
>>+#define entry_sync_writer entry_sync_writer128
>>+#define quanta_t u128
>>+#define NS(name) CONCATENATE(name, 128)
>>+#include "entry_sync_template.h"
>>+#endif
>>-- 
>>2.43.0
>>
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Baolu Lu 3 weeks, 5 days ago
On 3/10/26 08:06, Samiullah Khawaja wrote:
> On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
>> On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>>> From: Jason Gunthorpe <jgg@nvidia.com>
>>>
>>> Many IOMMU implementations store data structures in host memory that can
>>> be quite big. The iommu is able to DMA read the host memory using an
>>> atomic quanta, usually 64 or 128 bits, and will read an entry using
>>> multiple quanta reads.
>>>
>>> Updating the host memory datastructure entry while the HW is 
>>> concurrently
>>> DMA'ing it is a little bit involved, but if you want to do this 
>>> hitlessly,
>>> while never making the entry non-valid, then it becomes quite 
>>> complicated.
>>>
>>> entry_sync is a library to handle this task. It works on the notion of
>>> "used bits" which reflect which bits the HW is actually sensitive to and
>>> which bits are ignored by hardware. Many hardware specifications say
>>> things like 'if mode is X then bits ABC are ignored'.
>>>
>>> Using the ignored bits entry_sync can often compute a series of ordered
>>> writes and flushes that will allow the entry to be updated while keeping
>>> it valid. If such an update is not possible then entry will be made
>>> temporarily non-valid.
>>>
>>> A 64 and 128 bit quanta version is provided to support existing iommus.
>>>
>>> Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
>>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>> ---
>>> drivers/iommu/Kconfig               |  14 +++
>>> drivers/iommu/Makefile              |   1 +
>>> drivers/iommu/entry_sync.h          |  66 +++++++++++++
>>> drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
>>> drivers/iommu/entry_sync.c          |  68 +++++++++++++
>>> 5 files changed, 292 insertions(+)
>>> create mode 100644 drivers/iommu/entry_sync.h
>>> create mode 100644 drivers/iommu/entry_sync_template.h
>>> create mode 100644 drivers/iommu/entry_sync.c
>>>
>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>> index f86262b11416..2650c9fa125b 100644
>>> --- a/drivers/iommu/Kconfig
>>> +++ b/drivers/iommu/Kconfig
>>> @@ -145,6 +145,20 @@ config IOMMU_DEFAULT_PASSTHROUGH
>>>
>>> endchoice
>>>
>>> +config IOMMU_ENTRY_SYNC
>>> +    bool
>>> +    default n
>>> +
>>> +config IOMMU_ENTRY_SYNC64
>>> +    bool
>>> +    select IOMMU_ENTRY_SYNC
>>> +    default n
>>> +
>>> +config IOMMU_ENTRY_SYNC128
>>> +    bool
>>> +    select IOMMU_ENTRY_SYNC
>>> +    default n
>>> +
>>> config OF_IOMMU
>>>     def_bool y
>>>     depends on OF && IOMMU_API
>>> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>>> index 0275821f4ef9..bd923995497a 100644
>>> --- a/drivers/iommu/Makefile
>>> +++ b/drivers/iommu/Makefile
>>> @@ -10,6 +10,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
>>> obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
>>> obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
>>> obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>>> +obj-$(CONFIG_IOMMU_ENTRY_SYNC) += entry_sync.o
>>> obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
>>> obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
>>> obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
>>> diff --git a/drivers/iommu/entry_sync.h b/drivers/iommu/entry_sync.h
>>> new file mode 100644
>>> index 000000000000..004d421c71c0
>>> --- /dev/null
>>> +++ b/drivers/iommu/entry_sync.h
>>> @@ -0,0 +1,66 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +/*
>>> + * Many IOMMU implementations store data structures in host memory 
>>> that can be
>>> + * quite big. The iommu is able to DMA read the host memory using an 
>>> atomic
>>> + * quanta, usually 64 or 128 bits, and will read an entry using 
>>> multiple quanta
>>> + * reads.
>>> + *
>>> + * Updating the host memory datastructure entry while the HW is 
>>> concurrently
>>> + * DMA'ing it is a little bit involved, but if you want to do this 
>>> hitlessly,
>>> + * while never making the entry non-valid, then it becomes quite 
>>> complicated.
>>> + *
>>> + * entry_sync is a library to handle this task. It works on the 
>>> notion of "used
>>> + * bits" which reflect which bits the HW is actually sensitive to 
>>> and which bits
>>> + * are ignored by hardware. Many hardware specifications say things 
>>> like 'if
>>> + * mode is X then bits ABC are ignored'.
>>> + *
>>> + * Using the ignored bits entry_sync can often compute a series of 
>>> ordered
>>> + * writes and flushes that will allow the entry to be updated while 
>>> keeping it
>>> + * valid. If such an update is not possible then entry will be made 
>>> temporarily
>>> + * non-valid.
>>> + *
>>> + * A 64 and 128 bit quanta version is provided to support existing 
>>> iommus.
>>> + */
>>> +#ifndef IOMMU_ENTRY_SYNC_H
>>> +#define IOMMU_ENTRY_SYNC_H
>>> +
>>> +#include <linux/types.h>
>>> +#include <linux/compiler.h>
>>> +#include <linux/bug.h>
>>> +
>>> +/* Caller allocates a stack array of this length to call 
>>> entry_sync_write() */
>>> +#define ENTRY_SYNC_MEMORY_LEN(writer) ((writer)->num_quantas * 3)
>>> +
>>> +struct entry_sync_writer_ops64;
>>> +struct entry_sync_writer64 {
>>> +    const struct entry_sync_writer_ops64 *ops;
>>> +    size_t num_quantas;
>>> +    size_t vbit_quanta;
>>> +};
>>> +
>>> +struct entry_sync_writer_ops64 {
>>> +    void (*get_used)(const __le64 *entry, __le64 *used);
>>> +    void (*sync)(struct entry_sync_writer64 *writer);
>>> +};
>>> +
>>> +void entry_sync_write64(struct entry_sync_writer64 *writer, __le64 
>>> *entry,
>>> +            const __le64 *target, __le64 *memory,
>>> +            size_t memory_len);
>>> +
>>> +struct entry_sync_writer_ops128;
>>> +struct entry_sync_writer128 {
>>> +    const struct entry_sync_writer_ops128 *ops;
>>> +    size_t num_quantas;
>>> +    size_t vbit_quanta;
>>> +};
>>> +
>>> +struct entry_sync_writer_ops128 {
>>> +    void (*get_used)(const u128 *entry, u128 *used);
>>> +    void (*sync)(struct entry_sync_writer128 *writer);
>>> +};
>>> +
>>> +void entry_sync_write128(struct entry_sync_writer128 *writer, u128 
>>> *entry,
>>> +             const u128 *target, u128 *memory,
>>> +             size_t memory_len);
>>> +
>>> +#endif
>>> diff --git a/drivers/iommu/entry_sync_template.h b/drivers/iommu/ 
>>> entry_sync_template.h
>>> new file mode 100644
>>> index 000000000000..646f518b098e
>>> --- /dev/null
>>> +++ b/drivers/iommu/entry_sync_template.h
>>> @@ -0,0 +1,143 @@
>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>> +#include "entry_sync.h"
>>> +#include <linux/args.h>
>>> +#include <linux/bitops.h>
>>> +
>>> +#ifndef entry_sync_writer
>>> +#define entry_sync_writer entry_sync_writer64
>>> +#define quanta_t __le64
>>> +#define NS(name) CONCATENATE(name, 64)
>>> +#endif
>>> +
>>> +/*
>>> + * Figure out if we can do a hitless update of entry to become 
>>> target. Returns a
>>> + * bit mask where 1 indicates that a quanta word needs to be set 
>>> disruptively.
>>> + * unused_update is an intermediate value of entry that has unused 
>>> bits set to
>>> + * their new values.
>>> + */
>>> +static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
>>> +                const quanta_t *entry, const quanta_t *target,
>>> +                quanta_t *unused_update, quanta_t *memory)
>>> +{
>>> +    quanta_t *target_used = memory + writer->num_quantas * 1;
>>> +    quanta_t *cur_used = memory + writer->num_quantas * 2;
>>> +    u8 used_qword_diff = 0;
>>> +    unsigned int i;
>>> +
>>> +    writer->ops->get_used(entry, cur_used);
>>> +    writer->ops->get_used(target, target_used);
>>> +
>>> +    for (i = 0; i != writer->num_quantas; i++) {
>>> +        /*
>>> +         * Check that masks are up to date, the make functions are not
>>
>> nit: "the make functions" looks like a typo.

That seems to be a typo. Will clear it in v2.

>>> +         * allowed to set a bit to 1 if the used function doesn't 
>>> say it
>>> +         * is used.
>>> +         */
>>> +        WARN_ON_ONCE(target[i] & ~target_used[i]);
>>> +
>>> +        /* Bits can change because they are not currently being used */
>>> +        unused_update[i] = (entry[i] & cur_used[i]) |
>>> +                   (target[i] & ~cur_used[i]);
>>> +        /*
>>> +         * Each bit indicates that a used bit in a qword needs to be
>>> +         * changed after unused_update is applied.
>>> +         */
>>> +        if ((unused_update[i] & target_used[i]) != target[i])
>>> +            used_qword_diff |= 1 << i;
>>> +    }
>>> +    return used_qword_diff;
>>> +}
>>> +
>>> +/*
>>> + * Update the entry to the target configuration. The transition from 
>>> the current
>>> + * entry to the target entry takes place over multiple steps that 
>>> attempts to
>>> + * make the transition hitless if possible. This function takes care 
>>> not to
>>> + * create a situation where the HW can perceive a corrupted entry. 
>>> HW is only
>>> + * required to have a quanta-bit atomicity with stores from the CPU, 
>>> while
>>> + * entries are many quanta bit values big.
>>> + *
>>> + * The difference between the current value and the target value is 
>>> analyzed to
>>> + * determine which of three updates are required - disruptive, 
>>> hitless or no
>>> + * change.
>>> + *
>>> + * In the most general disruptive case we can make any update in 
>>> three steps:
>>> + *  - Disrupting the entry (V=0)
>>> + *  - Fill now unused quanta words, except qword 0 which contains V
>>> + *  - Make qword 0 have the final value and valid (V=1) with a 
>>> single 64
>>> + *    bit store
>>> + *
>>> + * However this disrupts the HW while it is happening. There are 
>>> several
>>> + * interesting cases where a STE/CD can be updated without 
>>> disturbing the HW
>>> + * because only a small number of bits are changing (S1DSS, CONFIG, 
>>> etc) or
>>> + * because the used bits don't intersect. We can detect this by 
>>> calculating how
>>> + * many 64 bit values need update after adjusting the unused bits 
>>> and skip the
>>> + * V=0 process. This relies on the IGNORED behavior described in the
>>> + * specification.
>>> + */
>>> +void NS(entry_sync_write)(struct entry_sync_writer *writer, quanta_t 
>>> *entry,
>>> +              const quanta_t *target, quanta_t *memory,
>>> +              size_t memory_len)
>>> +{
>>> +    quanta_t *unused_update = memory + writer->num_quantas * 0;
>>> +    u8 used_qword_diff;
>>> +
>>> +    if (WARN_ON(memory_len !=
>>> +            ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
>>> +        return;
>>> +
>>> +    used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
>>> +                        unused_update, memory);
>>> +    if (hweight8(used_qword_diff) == 1) {
>>> +        /*
>>> +         * Only one quanta needs its used bits to be changed. This is a
>>> +         * hitless update, update all bits the current entry is 
>>> ignoring
>>> +         * to their new values, then update a single "critical quanta"
>>> +         * to change the entry and finally 0 out any bits that are now
>>> +         * unused in the target configuration.
>>> +         */
>>> +        unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
>>> +
>>> +        /*
>>> +         * Skip writing unused bits in the critical quanta since we'll
>>> +         * be writing it in the next step anyways. This can save a sync
>>> +         * when the only change is in that quanta.
>>> +         */
>>> +        unused_update[critical_qword_index] =
>>> +            entry[critical_qword_index];
>>> +        NS(entry_set)(writer, entry, unused_update, 0,
>>> +                  writer->num_quantas);
>>> +        NS(entry_set)(writer, entry, target, critical_qword_index, 1);
>>> +        NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
>>> +    } else if (used_qword_diff) {
>>> +        /*
>>> +         * At least two quantas need their inuse bits to be changed.
>>> +         * This requires a breaking update, zero the V bit, write all
>>> +         * qwords but 0, then set qword 0
>>> +         */
>>> +        unused_update[writer->vbit_quanta] = 0;
>>> +        NS(entry_set)(writer, entry, unused_update, writer- 
>>> >vbit_quanta, 1);
>>> +
>>> +        if (writer->vbit_quanta != 0)
>>> +            NS(entry_set)(writer, entry, target, 0,
>>> +                      writer->vbit_quanta - 1);
>>
>> Looking at the definition of the entry_set below, the last argument is
>> length. So if vbit_quanta 1 then it would write zero len. Shouldn't it
>> be writing quantas before the vbit_quanta?
>>> +        if (writer->vbit_quanta != writer->num_quantas)
> 
> Looking at this again, I think vbit_quanta can never be equal to
> num_quanta as num_quantas is length and vbit_quanta is index?
>>> +            NS(entry_set)(writer, entry, target,
>>> +                      writer->vbit_quanta,
> 
> Starting from vbit_quanta will set the present bit if it is set in the
> target?
>>> +                      writer->num_quantas - 1);
>>
>> Sami here, the last argument should not have "- 1".
> 
> I meant "Same here".

This branch is the disruptive update path. The process is:

1. Clear the Valid bit. The hardware now ignores this entry.
2. Write all the new data for the words before the Valid bit.
3. Write all the new data for the words after the Valid bit.
4. Write the word containing the Valid bit. The entry is now live again
    with all the new data.

Yes. The last argument for entry_set is length, not index. So perhaps I
could update it like this?

diff --git a/drivers/iommu/entry_sync_template.h 
b/drivers/iommu/entry_sync_template.h
index 646f518b098e..423cbb874919 100644
--- a/drivers/iommu/entry_sync_template.h
+++ b/drivers/iommu/entry_sync_template.h
@@ -118,12 +118,11 @@ void NS(entry_sync_write)(struct entry_sync_writer 
*writer, quanta_t *entry,
                 NS(entry_set)(writer, entry, unused_update, 
writer->vbit_quanta, 1);

                 if (writer->vbit_quanta != 0)
-                       NS(entry_set)(writer, entry, target, 0,
-                                     writer->vbit_quanta - 1);
-               if (writer->vbit_quanta != writer->num_quantas)
+                       NS(entry_set)(writer, entry, target, 0, 
writer->vbit_quanta);
+               if (writer->vbit_quanta + 1 < writer->num_quantas)
                         NS(entry_set)(writer, entry, target,
-                                     writer->vbit_quanta,
-                                     writer->num_quantas - 1);
+                                     writer->vbit_quanta + 1,
+                                     writer->num_quantas - 
writer->vbit_quanta - 1);

                 NS(entry_set)(writer, entry, target, 
writer->vbit_quanta, 1);
         } else {

Thanks,
baolu
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Samiullah Khawaja 3 weeks, 2 days ago
On Sat, Mar 14, 2026 at 04:13:27PM +0800, Baolu Lu wrote:
>On 3/10/26 08:06, Samiullah Khawaja wrote:
>>On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
>>>On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>>>>From: Jason Gunthorpe <jgg@nvidia.com>
>>>>
>>>>Many IOMMU implementations store data structures in host memory that can
>>>>be quite big. The iommu is able to DMA read the host memory using an
>>>>atomic quanta, usually 64 or 128 bits, and will read an entry using
>>>>multiple quanta reads.
>>>>
>>>>Updating the host memory datastructure entry while the HW is 
>>>>concurrently
>>>>DMA'ing it is a little bit involved, but if you want to do this 
>>>>hitlessly,
>>>>while never making the entry non-valid, then it becomes quite 
>>>>complicated.
>>>>
>>>>entry_sync is a library to handle this task. It works on the notion of
>>>>"used bits" which reflect which bits the HW is actually sensitive to and
>>>>which bits are ignored by hardware. Many hardware specifications say
>>>>things like 'if mode is X then bits ABC are ignored'.
>>>>
>>>>Using the ignored bits entry_sync can often compute a series of ordered
>>>>writes and flushes that will allow the entry to be updated while keeping
>>>>it valid. If such an update is not possible then entry will be made
>>>>temporarily non-valid.
>>>>
>>>>A 64 and 128 bit quanta version is provided to support existing iommus.
>>>>
>>>>Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
>>>>Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>>>Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>>>---
>>>>drivers/iommu/Kconfig               |  14 +++
>>>>drivers/iommu/Makefile              |   1 +
>>>>drivers/iommu/entry_sync.h          |  66 +++++++++++++
>>>>drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
>>>>drivers/iommu/entry_sync.c          |  68 +++++++++++++
>>>>5 files changed, 292 insertions(+)
>>>>create mode 100644 drivers/iommu/entry_sync.h
>>>>create mode 100644 drivers/iommu/entry_sync_template.h
>>>>create mode 100644 drivers/iommu/entry_sync.c
>>>>
>>>>diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>>>index f86262b11416..2650c9fa125b 100644
>>>>--- a/drivers/iommu/Kconfig
>>>>+++ b/drivers/iommu/Kconfig
>>>>@@ -145,6 +145,20 @@ config IOMMU_DEFAULT_PASSTHROUGH
>>>>
>>>>endchoice
>>>>
>>>>+config IOMMU_ENTRY_SYNC
>>>>+    bool
>>>>+    default n
>>>>+
>>>>+config IOMMU_ENTRY_SYNC64
>>>>+    bool
>>>>+    select IOMMU_ENTRY_SYNC
>>>>+    default n
>>>>+
>>>>+config IOMMU_ENTRY_SYNC128
>>>>+    bool
>>>>+    select IOMMU_ENTRY_SYNC
>>>>+    default n
>>>>+
>>>>config OF_IOMMU
>>>>    def_bool y
>>>>    depends on OF && IOMMU_API
>>>>diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>>>>index 0275821f4ef9..bd923995497a 100644
>>>>--- a/drivers/iommu/Makefile
>>>>+++ b/drivers/iommu/Makefile
>>>>@@ -10,6 +10,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
>>>>obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
>>>>obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
>>>>obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>>>>+obj-$(CONFIG_IOMMU_ENTRY_SYNC) += entry_sync.o
>>>>obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
>>>>obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
>>>>obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
>>>>diff --git a/drivers/iommu/entry_sync.h b/drivers/iommu/entry_sync.h
>>>>new file mode 100644
>>>>index 000000000000..004d421c71c0
>>>>--- /dev/null
>>>>+++ b/drivers/iommu/entry_sync.h
>>>>@@ -0,0 +1,66 @@
>>>>+/* SPDX-License-Identifier: GPL-2.0-only */
>>>>+/*
>>>>+ * Many IOMMU implementations store data structures in host 
>>>>memory that can be
>>>>+ * quite big. The iommu is able to DMA read the host memory 
>>>>using an atomic
>>>>+ * quanta, usually 64 or 128 bits, and will read an entry using 
>>>>multiple quanta
>>>>+ * reads.
>>>>+ *
>>>>+ * Updating the host memory datastructure entry while the HW is 
>>>>concurrently
>>>>+ * DMA'ing it is a little bit involved, but if you want to do 
>>>>this hitlessly,
>>>>+ * while never making the entry non-valid, then it becomes 
>>>>quite complicated.
>>>>+ *
>>>>+ * entry_sync is a library to handle this task. It works on the 
>>>>notion of "used
>>>>+ * bits" which reflect which bits the HW is actually sensitive 
>>>>to and which bits
>>>>+ * are ignored by hardware. Many hardware specifications say 
>>>>things like 'if
>>>>+ * mode is X then bits ABC are ignored'.
>>>>+ *
>>>>+ * Using the ignored bits entry_sync can often compute a series 
>>>>of ordered
>>>>+ * writes and flushes that will allow the entry to be updated 
>>>>while keeping it
>>>>+ * valid. If such an update is not possible then entry will be 
>>>>made temporarily
>>>>+ * non-valid.
>>>>+ *
>>>>+ * A 64 and 128 bit quanta version is provided to support 
>>>>existing iommus.
>>>>+ */
>>>>+#ifndef IOMMU_ENTRY_SYNC_H
>>>>+#define IOMMU_ENTRY_SYNC_H
>>>>+
>>>>+#include <linux/types.h>
>>>>+#include <linux/compiler.h>
>>>>+#include <linux/bug.h>
>>>>+
>>>>+/* Caller allocates a stack array of this length to call 
>>>>entry_sync_write() */
>>>>+#define ENTRY_SYNC_MEMORY_LEN(writer) ((writer)->num_quantas * 3)
>>>>+
>>>>+struct entry_sync_writer_ops64;
>>>>+struct entry_sync_writer64 {
>>>>+    const struct entry_sync_writer_ops64 *ops;
>>>>+    size_t num_quantas;
>>>>+    size_t vbit_quanta;
>>>>+};
>>>>+
>>>>+struct entry_sync_writer_ops64 {
>>>>+    void (*get_used)(const __le64 *entry, __le64 *used);
>>>>+    void (*sync)(struct entry_sync_writer64 *writer);
>>>>+};
>>>>+
>>>>+void entry_sync_write64(struct entry_sync_writer64 *writer, 
>>>>__le64 *entry,
>>>>+            const __le64 *target, __le64 *memory,
>>>>+            size_t memory_len);
>>>>+
>>>>+struct entry_sync_writer_ops128;
>>>>+struct entry_sync_writer128 {
>>>>+    const struct entry_sync_writer_ops128 *ops;
>>>>+    size_t num_quantas;
>>>>+    size_t vbit_quanta;
>>>>+};
>>>>+
>>>>+struct entry_sync_writer_ops128 {
>>>>+    void (*get_used)(const u128 *entry, u128 *used);
>>>>+    void (*sync)(struct entry_sync_writer128 *writer);
>>>>+};
>>>>+
>>>>+void entry_sync_write128(struct entry_sync_writer128 *writer, 
>>>>u128 *entry,
>>>>+             const u128 *target, u128 *memory,
>>>>+             size_t memory_len);
>>>>+
>>>>+#endif
>>>>diff --git a/drivers/iommu/entry_sync_template.h 
>>>>b/drivers/iommu/ entry_sync_template.h
>>>>new file mode 100644
>>>>index 000000000000..646f518b098e
>>>>--- /dev/null
>>>>+++ b/drivers/iommu/entry_sync_template.h
>>>>@@ -0,0 +1,143 @@
>>>>+/* SPDX-License-Identifier: GPL-2.0-only */
>>>>+#include "entry_sync.h"
>>>>+#include <linux/args.h>
>>>>+#include <linux/bitops.h>
>>>>+
>>>>+#ifndef entry_sync_writer
>>>>+#define entry_sync_writer entry_sync_writer64
>>>>+#define quanta_t __le64
>>>>+#define NS(name) CONCATENATE(name, 64)
>>>>+#endif
>>>>+
>>>>+/*
>>>>+ * Figure out if we can do a hitless update of entry to become 
>>>>target. Returns a
>>>>+ * bit mask where 1 indicates that a quanta word needs to be 
>>>>set disruptively.
>>>>+ * unused_update is an intermediate value of entry that has 
>>>>unused bits set to
>>>>+ * their new values.
>>>>+ */
>>>>+static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
>>>>+                const quanta_t *entry, const quanta_t *target,
>>>>+                quanta_t *unused_update, quanta_t *memory)
>>>>+{
>>>>+    quanta_t *target_used = memory + writer->num_quantas * 1;
>>>>+    quanta_t *cur_used = memory + writer->num_quantas * 2;
>>>>+    u8 used_qword_diff = 0;
>>>>+    unsigned int i;
>>>>+
>>>>+    writer->ops->get_used(entry, cur_used);
>>>>+    writer->ops->get_used(target, target_used);
>>>>+
>>>>+    for (i = 0; i != writer->num_quantas; i++) {
>>>>+        /*
>>>>+         * Check that masks are up to date, the make functions are not
>>>
>>>nit: "the make functions" looks like a typo.
>
>That seems to be a typo. Will clear it in v2.
>
>>>>+         * allowed to set a bit to 1 if the used function 
>>>>doesn't say it
>>>>+         * is used.
>>>>+         */
>>>>+        WARN_ON_ONCE(target[i] & ~target_used[i]);
>>>>+
>>>>+        /* Bits can change because they are not currently being used */
>>>>+        unused_update[i] = (entry[i] & cur_used[i]) |
>>>>+                   (target[i] & ~cur_used[i]);
>>>>+        /*
>>>>+         * Each bit indicates that a used bit in a qword needs to be
>>>>+         * changed after unused_update is applied.
>>>>+         */
>>>>+        if ((unused_update[i] & target_used[i]) != target[i])
>>>>+            used_qword_diff |= 1 << i;
>>>>+    }
>>>>+    return used_qword_diff;
>>>>+}
>>>>+
>>>>+/*
>>>>+ * Update the entry to the target configuration. The transition 
>>>>from the current
>>>>+ * entry to the target entry takes place over multiple steps 
>>>>that attempts to
>>>>+ * make the transition hitless if possible. This function takes 
>>>>care not to
>>>>+ * create a situation where the HW can perceive a corrupted 
>>>>entry. HW is only
>>>>+ * required to have a quanta-bit atomicity with stores from the 
>>>>CPU, while
>>>>+ * entries are many quanta bit values big.
>>>>+ *
>>>>+ * The difference between the current value and the target 
>>>>value is analyzed to
>>>>+ * determine which of three updates are required - disruptive, 
>>>>hitless or no
>>>>+ * change.
>>>>+ *
>>>>+ * In the most general disruptive case we can make any update 
>>>>in three steps:
>>>>+ *  - Disrupting the entry (V=0)
>>>>+ *  - Fill now unused quanta words, except qword 0 which contains V
>>>>+ *  - Make qword 0 have the final value and valid (V=1) with a 
>>>>single 64
>>>>+ *    bit store
>>>>+ *
>>>>+ * However this disrupts the HW while it is happening. There 
>>>>are several
>>>>+ * interesting cases where a STE/CD can be updated without 
>>>>disturbing the HW
>>>>+ * because only a small number of bits are changing (S1DSS, 
>>>>CONFIG, etc) or
>>>>+ * because the used bits don't intersect. We can detect this by 
>>>>calculating how
>>>>+ * many 64 bit values need update after adjusting the unused 
>>>>bits and skip the
>>>>+ * V=0 process. This relies on the IGNORED behavior described in the
>>>>+ * specification.
>>>>+ */
>>>>+void NS(entry_sync_write)(struct entry_sync_writer *writer, 
>>>>quanta_t *entry,
>>>>+              const quanta_t *target, quanta_t *memory,
>>>>+              size_t memory_len)
>>>>+{
>>>>+    quanta_t *unused_update = memory + writer->num_quantas * 0;
>>>>+    u8 used_qword_diff;
>>>>+
>>>>+    if (WARN_ON(memory_len !=
>>>>+            ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
>>>>+        return;
>>>>+
>>>>+    used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
>>>>+                        unused_update, memory);
>>>>+    if (hweight8(used_qword_diff) == 1) {
>>>>+        /*
>>>>+         * Only one quanta needs its used bits to be changed. This is a
>>>>+         * hitless update, update all bits the current entry is 
>>>>ignoring
>>>>+         * to their new values, then update a single "critical quanta"
>>>>+         * to change the entry and finally 0 out any bits that are now
>>>>+         * unused in the target configuration.
>>>>+         */
>>>>+        unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
>>>>+
>>>>+        /*
>>>>+         * Skip writing unused bits in the critical quanta since we'll
>>>>+         * be writing it in the next step anyways. This can save a sync
>>>>+         * when the only change is in that quanta.
>>>>+         */
>>>>+        unused_update[critical_qword_index] =
>>>>+            entry[critical_qword_index];
>>>>+        NS(entry_set)(writer, entry, unused_update, 0,
>>>>+                  writer->num_quantas);
>>>>+        NS(entry_set)(writer, entry, target, critical_qword_index, 1);
>>>>+        NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
>>>>+    } else if (used_qword_diff) {
>>>>+        /*
>>>>+         * At least two quantas need their inuse bits to be changed.
>>>>+         * This requires a breaking update, zero the V bit, write all
>>>>+         * qwords but 0, then set qword 0
>>>>+         */
>>>>+        unused_update[writer->vbit_quanta] = 0;
>>>>+        NS(entry_set)(writer, entry, unused_update, writer- 
>>>>>vbit_quanta, 1);
>>>>+
>>>>+        if (writer->vbit_quanta != 0)
>>>>+            NS(entry_set)(writer, entry, target, 0,
>>>>+                      writer->vbit_quanta - 1);
>>>
>>>Looking at the definition of the entry_set below, the last argument is
>>>length. So if vbit_quanta 1 then it would write zero len. Shouldn't it
>>>be writing quantas before the vbit_quanta?
>>>>+        if (writer->vbit_quanta != writer->num_quantas)
>>
>>Looking at this again, I think vbit_quanta can never be equal to
>>num_quanta as num_quantas is length and vbit_quanta is index?
>>>>+            NS(entry_set)(writer, entry, target,
>>>>+                      writer->vbit_quanta,
>>
>>Starting from vbit_quanta will set the present bit if it is set in the
>>target?
>>>>+                      writer->num_quantas - 1);
>>>
>>>Sami here, the last argument should not have "- 1".
>>
>>I meant "Same here".
>
>This branch is the disruptive update path. The process is:
>
>1. Clear the Valid bit. The hardware now ignores this entry.
>2. Write all the new data for the words before the Valid bit.
>3. Write all the new data for the words after the Valid bit.
>4. Write the word containing the Valid bit. The entry is now live again
>   with all the new data.
>
>Yes. The last argument for entry_set is length, not index. So perhaps I
>could update it like this?
>
>diff --git a/drivers/iommu/entry_sync_template.h 
>b/drivers/iommu/entry_sync_template.h
>index 646f518b098e..423cbb874919 100644
>--- a/drivers/iommu/entry_sync_template.h
>+++ b/drivers/iommu/entry_sync_template.h
>@@ -118,12 +118,11 @@ void NS(entry_sync_write)(struct 
>entry_sync_writer *writer, quanta_t *entry,
>                NS(entry_set)(writer, entry, unused_update, 
>writer->vbit_quanta, 1);
>
>                if (writer->vbit_quanta != 0)
>-                       NS(entry_set)(writer, entry, target, 0,
>-                                     writer->vbit_quanta - 1);
>-               if (writer->vbit_quanta != writer->num_quantas)
>+                       NS(entry_set)(writer, entry, target, 0, 
>writer->vbit_quanta);
>+               if (writer->vbit_quanta + 1 < writer->num_quantas)
>                        NS(entry_set)(writer, entry, target,
>-                                     writer->vbit_quanta,
>-                                     writer->num_quantas - 1);
>+                                     writer->vbit_quanta + 1,
>+                                     writer->num_quantas - 
>writer->vbit_quanta - 1);

This looks good.

nit: I am wondering whether we can change the arguments to the function,
by modifying the loop in entry_set, to be start and end quanta instead?
That way the caller doesn't have to do these bound checks? What do you
think?
>
>
>                NS(entry_set)(writer, entry, target, 
>writer->vbit_quanta, 1);
>        } else {
>
>Thanks,
>baolu

Thanks,
Sami
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Baolu Lu 3 weeks, 1 day ago
On 3/17/26 00:35, Samiullah Khawaja wrote:
> On Sat, Mar 14, 2026 at 04:13:27PM +0800, Baolu Lu wrote:
>> On 3/10/26 08:06, Samiullah Khawaja wrote:
>>> On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
>>>> On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>>>>> From: Jason Gunthorpe <jgg@nvidia.com>
>>>>>
>>>>> Many IOMMU implementations store data structures in host memory 
>>>>> that can
>>>>> be quite big. The iommu is able to DMA read the host memory using an
>>>>> atomic quanta, usually 64 or 128 bits, and will read an entry using
>>>>> multiple quanta reads.
>>>>>
>>>>> Updating the host memory datastructure entry while the HW is 
>>>>> concurrently
>>>>> DMA'ing it is a little bit involved, but if you want to do this 
>>>>> hitlessly,
>>>>> while never making the entry non-valid, then it becomes quite 
>>>>> complicated.
>>>>>
>>>>> entry_sync is a library to handle this task. It works on the notion of
>>>>> "used bits" which reflect which bits the HW is actually sensitive 
>>>>> to and
>>>>> which bits are ignored by hardware. Many hardware specifications say
>>>>> things like 'if mode is X then bits ABC are ignored'.
>>>>>
>>>>> Using the ignored bits entry_sync can often compute a series of 
>>>>> ordered
>>>>> writes and flushes that will allow the entry to be updated while 
>>>>> keeping
>>>>> it valid. If such an update is not possible then entry will be made
>>>>> temporarily non-valid.
>>>>>
>>>>> A 64 and 128 bit quanta version is provided to support existing 
>>>>> iommus.
>>>>>
>>>>> Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
>>>>> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
>>>>> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
>>>>> ---
>>>>> drivers/iommu/Kconfig               |  14 +++
>>>>> drivers/iommu/Makefile              |   1 +
>>>>> drivers/iommu/entry_sync.h          |  66 +++++++++++++
>>>>> drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
>>>>> drivers/iommu/entry_sync.c          |  68 +++++++++++++
>>>>> 5 files changed, 292 insertions(+)
>>>>> create mode 100644 drivers/iommu/entry_sync.h
>>>>> create mode 100644 drivers/iommu/entry_sync_template.h
>>>>> create mode 100644 drivers/iommu/entry_sync.c
>>>>>
>>>>> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
>>>>> index f86262b11416..2650c9fa125b 100644
>>>>> --- a/drivers/iommu/Kconfig
>>>>> +++ b/drivers/iommu/Kconfig
>>>>> @@ -145,6 +145,20 @@ config IOMMU_DEFAULT_PASSTHROUGH
>>>>>
>>>>> endchoice
>>>>>
>>>>> +config IOMMU_ENTRY_SYNC
>>>>> +    bool
>>>>> +    default n
>>>>> +
>>>>> +config IOMMU_ENTRY_SYNC64
>>>>> +    bool
>>>>> +    select IOMMU_ENTRY_SYNC
>>>>> +    default n
>>>>> +
>>>>> +config IOMMU_ENTRY_SYNC128
>>>>> +    bool
>>>>> +    select IOMMU_ENTRY_SYNC
>>>>> +    default n
>>>>> +
>>>>> config OF_IOMMU
>>>>>     def_bool y
>>>>>     depends on OF && IOMMU_API
>>>>> diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
>>>>> index 0275821f4ef9..bd923995497a 100644
>>>>> --- a/drivers/iommu/Makefile
>>>>> +++ b/drivers/iommu/Makefile
>>>>> @@ -10,6 +10,7 @@ obj-$(CONFIG_IOMMU_API) += iommu-traces.o
>>>>> obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o
>>>>> obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o
>>>>> obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o
>>>>> +obj-$(CONFIG_IOMMU_ENTRY_SYNC) += entry_sync.o
>>>>> obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
>>>>> obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
>>>>> obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
>>>>> diff --git a/drivers/iommu/entry_sync.h b/drivers/iommu/entry_sync.h
>>>>> new file mode 100644
>>>>> index 000000000000..004d421c71c0
>>>>> --- /dev/null
>>>>> +++ b/drivers/iommu/entry_sync.h
>>>>> @@ -0,0 +1,66 @@
>>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>>> +/*
>>>>> + * Many IOMMU implementations store data structures in host memory 
>>>>> that can be
>>>>> + * quite big. The iommu is able to DMA read the host memory using 
>>>>> an atomic
>>>>> + * quanta, usually 64 or 128 bits, and will read an entry using 
>>>>> multiple quanta
>>>>> + * reads.
>>>>> + *
>>>>> + * Updating the host memory datastructure entry while the HW is 
>>>>> concurrently
>>>>> + * DMA'ing it is a little bit involved, but if you want to do this 
>>>>> hitlessly,
>>>>> + * while never making the entry non-valid, then it becomes quite 
>>>>> complicated.
>>>>> + *
>>>>> + * entry_sync is a library to handle this task. It works on the 
>>>>> notion of "used
>>>>> + * bits" which reflect which bits the HW is actually sensitive to 
>>>>> and which bits
>>>>> + * are ignored by hardware. Many hardware specifications say 
>>>>> things like 'if
>>>>> + * mode is X then bits ABC are ignored'.
>>>>> + *
>>>>> + * Using the ignored bits entry_sync can often compute a series of 
>>>>> ordered
>>>>> + * writes and flushes that will allow the entry to be updated 
>>>>> while keeping it
>>>>> + * valid. If such an update is not possible then entry will be 
>>>>> made temporarily
>>>>> + * non-valid.
>>>>> + *
>>>>> + * A 64 and 128 bit quanta version is provided to support existing 
>>>>> iommus.
>>>>> + */
>>>>> +#ifndef IOMMU_ENTRY_SYNC_H
>>>>> +#define IOMMU_ENTRY_SYNC_H
>>>>> +
>>>>> +#include <linux/types.h>
>>>>> +#include <linux/compiler.h>
>>>>> +#include <linux/bug.h>
>>>>> +
>>>>> +/* Caller allocates a stack array of this length to call 
>>>>> entry_sync_write() */
>>>>> +#define ENTRY_SYNC_MEMORY_LEN(writer) ((writer)->num_quantas * 3)
>>>>> +
>>>>> +struct entry_sync_writer_ops64;
>>>>> +struct entry_sync_writer64 {
>>>>> +    const struct entry_sync_writer_ops64 *ops;
>>>>> +    size_t num_quantas;
>>>>> +    size_t vbit_quanta;
>>>>> +};
>>>>> +
>>>>> +struct entry_sync_writer_ops64 {
>>>>> +    void (*get_used)(const __le64 *entry, __le64 *used);
>>>>> +    void (*sync)(struct entry_sync_writer64 *writer);
>>>>> +};
>>>>> +
>>>>> +void entry_sync_write64(struct entry_sync_writer64 *writer, __le64 
>>>>> *entry,
>>>>> +            const __le64 *target, __le64 *memory,
>>>>> +            size_t memory_len);
>>>>> +
>>>>> +struct entry_sync_writer_ops128;
>>>>> +struct entry_sync_writer128 {
>>>>> +    const struct entry_sync_writer_ops128 *ops;
>>>>> +    size_t num_quantas;
>>>>> +    size_t vbit_quanta;
>>>>> +};
>>>>> +
>>>>> +struct entry_sync_writer_ops128 {
>>>>> +    void (*get_used)(const u128 *entry, u128 *used);
>>>>> +    void (*sync)(struct entry_sync_writer128 *writer);
>>>>> +};
>>>>> +
>>>>> +void entry_sync_write128(struct entry_sync_writer128 *writer, u128 
>>>>> *entry,
>>>>> +             const u128 *target, u128 *memory,
>>>>> +             size_t memory_len);
>>>>> +
>>>>> +#endif
>>>>> diff --git a/drivers/iommu/entry_sync_template.h b/drivers/iommu/ 
>>>>> entry_sync_template.h
>>>>> new file mode 100644
>>>>> index 000000000000..646f518b098e
>>>>> --- /dev/null
>>>>> +++ b/drivers/iommu/entry_sync_template.h
>>>>> @@ -0,0 +1,143 @@
>>>>> +/* SPDX-License-Identifier: GPL-2.0-only */
>>>>> +#include "entry_sync.h"
>>>>> +#include <linux/args.h>
>>>>> +#include <linux/bitops.h>
>>>>> +
>>>>> +#ifndef entry_sync_writer
>>>>> +#define entry_sync_writer entry_sync_writer64
>>>>> +#define quanta_t __le64
>>>>> +#define NS(name) CONCATENATE(name, 64)
>>>>> +#endif
>>>>> +
>>>>> +/*
>>>>> + * Figure out if we can do a hitless update of entry to become 
>>>>> target. Returns a
>>>>> + * bit mask where 1 indicates that a quanta word needs to be set 
>>>>> disruptively.
>>>>> + * unused_update is an intermediate value of entry that has unused 
>>>>> bits set to
>>>>> + * their new values.
>>>>> + */
>>>>> +static u8 NS(entry_quanta_diff)(struct entry_sync_writer *writer,
>>>>> +                const quanta_t *entry, const quanta_t *target,
>>>>> +                quanta_t *unused_update, quanta_t *memory)
>>>>> +{
>>>>> +    quanta_t *target_used = memory + writer->num_quantas * 1;
>>>>> +    quanta_t *cur_used = memory + writer->num_quantas * 2;
>>>>> +    u8 used_qword_diff = 0;
>>>>> +    unsigned int i;
>>>>> +
>>>>> +    writer->ops->get_used(entry, cur_used);
>>>>> +    writer->ops->get_used(target, target_used);
>>>>> +
>>>>> +    for (i = 0; i != writer->num_quantas; i++) {
>>>>> +        /*
>>>>> +         * Check that masks are up to date, the make functions are 
>>>>> not
>>>>
>>>> nit: "the make functions" looks like a typo.
>>
>> That seems to be a typo. Will clear it in v2.
>>
>>>>> +         * allowed to set a bit to 1 if the used function doesn't 
>>>>> say it
>>>>> +         * is used.
>>>>> +         */
>>>>> +        WARN_ON_ONCE(target[i] & ~target_used[i]);
>>>>> +
>>>>> +        /* Bits can change because they are not currently being 
>>>>> used */
>>>>> +        unused_update[i] = (entry[i] & cur_used[i]) |
>>>>> +                   (target[i] & ~cur_used[i]);
>>>>> +        /*
>>>>> +         * Each bit indicates that a used bit in a qword needs to be
>>>>> +         * changed after unused_update is applied.
>>>>> +         */
>>>>> +        if ((unused_update[i] & target_used[i]) != target[i])
>>>>> +            used_qword_diff |= 1 << i;
>>>>> +    }
>>>>> +    return used_qword_diff;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Update the entry to the target configuration. The transition 
>>>>> from the current
>>>>> + * entry to the target entry takes place over multiple steps that 
>>>>> attempts to
>>>>> + * make the transition hitless if possible. This function takes 
>>>>> care not to
>>>>> + * create a situation where the HW can perceive a corrupted entry. 
>>>>> HW is only
>>>>> + * required to have a quanta-bit atomicity with stores from the 
>>>>> CPU, while
>>>>> + * entries are many quanta bit values big.
>>>>> + *
>>>>> + * The difference between the current value and the target value 
>>>>> is analyzed to
>>>>> + * determine which of three updates are required - disruptive, 
>>>>> hitless or no
>>>>> + * change.
>>>>> + *
>>>>> + * In the most general disruptive case we can make any update in 
>>>>> three steps:
>>>>> + *  - Disrupting the entry (V=0)
>>>>> + *  - Fill now unused quanta words, except qword 0 which contains V
>>>>> + *  - Make qword 0 have the final value and valid (V=1) with a 
>>>>> single 64
>>>>> + *    bit store
>>>>> + *
>>>>> + * However this disrupts the HW while it is happening. There are 
>>>>> several
>>>>> + * interesting cases where a STE/CD can be updated without 
>>>>> disturbing the HW
>>>>> + * because only a small number of bits are changing (S1DSS, 
>>>>> CONFIG, etc) or
>>>>> + * because the used bits don't intersect. We can detect this by 
>>>>> calculating how
>>>>> + * many 64 bit values need update after adjusting the unused bits 
>>>>> and skip the
>>>>> + * V=0 process. This relies on the IGNORED behavior described in the
>>>>> + * specification.
>>>>> + */
>>>>> +void NS(entry_sync_write)(struct entry_sync_writer *writer, 
>>>>> quanta_t *entry,
>>>>> +              const quanta_t *target, quanta_t *memory,
>>>>> +              size_t memory_len)
>>>>> +{
>>>>> +    quanta_t *unused_update = memory + writer->num_quantas * 0;
>>>>> +    u8 used_qword_diff;
>>>>> +
>>>>> +    if (WARN_ON(memory_len !=
>>>>> +            ENTRY_SYNC_MEMORY_LEN(writer) * sizeof(*memory)))
>>>>> +        return;
>>>>> +
>>>>> +    used_qword_diff = NS(entry_quanta_diff)(writer, entry, target,
>>>>> +                        unused_update, memory);
>>>>> +    if (hweight8(used_qword_diff) == 1) {
>>>>> +        /*
>>>>> +         * Only one quanta needs its used bits to be changed. This 
>>>>> is a
>>>>> +         * hitless update, update all bits the current entry is 
>>>>> ignoring
>>>>> +         * to their new values, then update a single "critical 
>>>>> quanta"
>>>>> +         * to change the entry and finally 0 out any bits that are 
>>>>> now
>>>>> +         * unused in the target configuration.
>>>>> +         */
>>>>> +        unsigned int critical_qword_index = ffs(used_qword_diff) - 1;
>>>>> +
>>>>> +        /*
>>>>> +         * Skip writing unused bits in the critical quanta since 
>>>>> we'll
>>>>> +         * be writing it in the next step anyways. This can save a 
>>>>> sync
>>>>> +         * when the only change is in that quanta.
>>>>> +         */
>>>>> +        unused_update[critical_qword_index] =
>>>>> +            entry[critical_qword_index];
>>>>> +        NS(entry_set)(writer, entry, unused_update, 0,
>>>>> +                  writer->num_quantas);
>>>>> +        NS(entry_set)(writer, entry, target, critical_qword_index, 
>>>>> 1);
>>>>> +        NS(entry_set)(writer, entry, target, 0, writer->num_quantas);
>>>>> +    } else if (used_qword_diff) {
>>>>> +        /*
>>>>> +         * At least two quantas need their inuse bits to be changed.
>>>>> +         * This requires a breaking update, zero the V bit, write all
>>>>> +         * qwords but 0, then set qword 0
>>>>> +         */
>>>>> +        unused_update[writer->vbit_quanta] = 0;
>>>>> +        NS(entry_set)(writer, entry, unused_update, writer-
>>>>>> vbit_quanta, 1);
>>>>> +
>>>>> +        if (writer->vbit_quanta != 0)
>>>>> +            NS(entry_set)(writer, entry, target, 0,
>>>>> +                      writer->vbit_quanta - 1);
>>>>
>>>> Looking at the definition of the entry_set below, the last argument is
>>>> length. So if vbit_quanta 1 then it would write zero len. Shouldn't it
>>>> be writing quantas before the vbit_quanta?
>>>>> +        if (writer->vbit_quanta != writer->num_quantas)
>>>
>>> Looking at this again, I think vbit_quanta can never be equal to
>>> num_quanta as num_quantas is length and vbit_quanta is index?
>>>>> +            NS(entry_set)(writer, entry, target,
>>>>> +                      writer->vbit_quanta,
>>>
>>> Starting from vbit_quanta will set the present bit if it is set in the
>>> target?
>>>>> +                      writer->num_quantas - 1);
>>>>
>>>> Sami here, the last argument should not have "- 1".
>>>
>>> I meant "Same here".
>>
>> This branch is the disruptive update path. The process is:
>>
>> 1. Clear the Valid bit. The hardware now ignores this entry.
>> 2. Write all the new data for the words before the Valid bit.
>> 3. Write all the new data for the words after the Valid bit.
>> 4. Write the word containing the Valid bit. The entry is now live again
>>   with all the new data.
>>
>> Yes. The last argument for entry_set is length, not index. So perhaps I
>> could update it like this?
>>
>> diff --git a/drivers/iommu/entry_sync_template.h b/drivers/iommu/ 
>> entry_sync_template.h
>> index 646f518b098e..423cbb874919 100644
>> --- a/drivers/iommu/entry_sync_template.h
>> +++ b/drivers/iommu/entry_sync_template.h
>> @@ -118,12 +118,11 @@ void NS(entry_sync_write)(struct 
>> entry_sync_writer *writer, quanta_t *entry,
>>                NS(entry_set)(writer, entry, unused_update, writer- 
>> >vbit_quanta, 1);
>>
>>                if (writer->vbit_quanta != 0)
>> -                       NS(entry_set)(writer, entry, target, 0,
>> -                                     writer->vbit_quanta - 1);
>> -               if (writer->vbit_quanta != writer->num_quantas)
>> +                       NS(entry_set)(writer, entry, target, 0, 
>> writer->vbit_quanta);
>> +               if (writer->vbit_quanta + 1 < writer->num_quantas)
>>                        NS(entry_set)(writer, entry, target,
>> -                                     writer->vbit_quanta,
>> -                                     writer->num_quantas - 1);
>> +                                     writer->vbit_quanta + 1,
>> +                                     writer->num_quantas - writer- 
>> >vbit_quanta - 1);
> 
> This looks good.
> 
> nit: I am wondering whether we can change the arguments to the function,
> by modifying the loop in entry_set, to be start and end quanta instead?
> That way the caller doesn't have to do these bound checks? What do you
> think?

I have no strong opinion about this. It appears that Linux kernel memory
set or copy functions follow the (pointer, offset, length) pattern;
moving away from this might cause confusion for other developers.
Anyway, if others also prefer that approach, I am fine with adjusting it.

Thanks,
baolu
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Will Deacon 3 weeks, 3 days ago
On Sat, Mar 14, 2026 at 04:13:27PM +0800, Baolu Lu wrote:
> On 3/10/26 08:06, Samiullah Khawaja wrote:
> > On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
> > > On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
> > > > From: Jason Gunthorpe <jgg@nvidia.com>
> > > > 
> > > > Many IOMMU implementations store data structures in host memory that can
> > > > be quite big. The iommu is able to DMA read the host memory using an
> > > > atomic quanta, usually 64 or 128 bits, and will read an entry using
> > > > multiple quanta reads.
> > > > 
> > > > Updating the host memory datastructure entry while the HW is
> > > > concurrently
> > > > DMA'ing it is a little bit involved, but if you want to do this
> > > > hitlessly,
> > > > while never making the entry non-valid, then it becomes quite
> > > > complicated.
> > > > 
> > > > entry_sync is a library to handle this task. It works on the notion of
> > > > "used bits" which reflect which bits the HW is actually sensitive to and
> > > > which bits are ignored by hardware. Many hardware specifications say
> > > > things like 'if mode is X then bits ABC are ignored'.
> > > > 
> > > > Using the ignored bits entry_sync can often compute a series of ordered
> > > > writes and flushes that will allow the entry to be updated while keeping
> > > > it valid. If such an update is not possible then entry will be made
> > > > temporarily non-valid.
> > > > 
> > > > A 64 and 128 bit quanta version is provided to support existing iommus.
> > > > 
> > > > Co-developed-by: Lu Baolu <baolu.lu@linux.intel.com>
> > > > Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> > > > Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
> > > > ---
> > > > drivers/iommu/Kconfig               |  14 +++
> > > > drivers/iommu/Makefile              |   1 +
> > > > drivers/iommu/entry_sync.h          |  66 +++++++++++++
> > > > drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
> > > > drivers/iommu/entry_sync.c          |  68 +++++++++++++
> > > > 5 files changed, 292 insertions(+)
> > > > create mode 100644 drivers/iommu/entry_sync.h
> > > > create mode 100644 drivers/iommu/entry_sync_template.h
> > > > create mode 100644 drivers/iommu/entry_sync.c

Shouldn't we move the SMMU driver over to this, rather than copy-pasting
everything? If not, then why is it in generic IOMMU code?

Will
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Baolu Lu 3 weeks, 1 day ago
On 3/16/26 17:51, Will Deacon wrote:
> On Sat, Mar 14, 2026 at 04:13:27PM +0800, Baolu Lu wrote:
>> On 3/10/26 08:06, Samiullah Khawaja wrote:
>>> On Mon, Mar 09, 2026 at 11:33:23PM +0000, Samiullah Khawaja wrote:
>>>> On Mon, Mar 09, 2026 at 02:06:41PM +0800, Lu Baolu wrote:
>>>>> From: Jason Gunthorpe<jgg@nvidia.com>
>>>>>
>>>>> Many IOMMU implementations store data structures in host memory that can
>>>>> be quite big. The iommu is able to DMA read the host memory using an
>>>>> atomic quanta, usually 64 or 128 bits, and will read an entry using
>>>>> multiple quanta reads.
>>>>>
>>>>> Updating the host memory datastructure entry while the HW is
>>>>> concurrently
>>>>> DMA'ing it is a little bit involved, but if you want to do this
>>>>> hitlessly,
>>>>> while never making the entry non-valid, then it becomes quite
>>>>> complicated.
>>>>>
>>>>> entry_sync is a library to handle this task. It works on the notion of
>>>>> "used bits" which reflect which bits the HW is actually sensitive to and
>>>>> which bits are ignored by hardware. Many hardware specifications say
>>>>> things like 'if mode is X then bits ABC are ignored'.
>>>>>
>>>>> Using the ignored bits entry_sync can often compute a series of ordered
>>>>> writes and flushes that will allow the entry to be updated while keeping
>>>>> it valid. If such an update is not possible then entry will be made
>>>>> temporarily non-valid.
>>>>>
>>>>> A 64 and 128 bit quanta version is provided to support existing iommus.
>>>>>
>>>>> Co-developed-by: Lu Baolu<baolu.lu@linux.intel.com>
>>>>> Signed-off-by: Lu Baolu<baolu.lu@linux.intel.com>
>>>>> Signed-off-by: Jason Gunthorpe<jgg@nvidia.com>
>>>>> ---
>>>>> drivers/iommu/Kconfig               |  14 +++
>>>>> drivers/iommu/Makefile              |   1 +
>>>>> drivers/iommu/entry_sync.h          |  66 +++++++++++++
>>>>> drivers/iommu/entry_sync_template.h | 143 ++++++++++++++++++++++++++++
>>>>> drivers/iommu/entry_sync.c          |  68 +++++++++++++
>>>>> 5 files changed, 292 insertions(+)
>>>>> create mode 100644 drivers/iommu/entry_sync.h
>>>>> create mode 100644 drivers/iommu/entry_sync_template.h
>>>>> create mode 100644 drivers/iommu/entry_sync.c
> Shouldn't we move the SMMU driver over to this, rather than copy-pasting
> everything? If not, then why is it in generic IOMMU code?

Yes. I will start to do this from the next version.

Thanks,
baolu
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Jason Gunthorpe 2 weeks, 2 days ago
On Wed, Mar 18, 2026 at 11:10:12AM +0800, Baolu Lu wrote:
> > Shouldn't we move the SMMU driver over to this, rather than copy-pasting
> > everything? If not, then why is it in generic IOMMU code?
> 
> Yes. I will start to do this from the next version.

I had written a draft already:

https://github.com/jgunthorpe/linux/commit/cda5c27a4020d162948259df9d3c8dd61196290a

Jason
Re: [PATCH 1/8] iommu: Lift and generalize the STE/CD update code from SMMUv3
Posted by Baolu Lu 2 weeks, 2 days ago
On 3/23/26 20:55, Jason Gunthorpe wrote:
> On Wed, Mar 18, 2026 at 11:10:12AM +0800, Baolu Lu wrote:
>>> Shouldn't we move the SMMU driver over to this, rather than copy-pasting
>>> everything? If not, then why is it in generic IOMMU code?
>> Yes. I will start to do this from the next version.
> I had written a draft already:
> 
> https://github.com/jgunthorpe/linux/commit/ 
> cda5c27a4020d162948259df9d3c8dd61196290a

Yeah, I will include this in the next version.

Thanks,
baolu