[PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Lu Baolu 3 weeks, 4 days ago
There is a newer version of this series

On Intel IOMMU, device context entries are accessed by hardware in
128-bit chunks. Currently, the driver updates these entries by
programming the 'lo' and 'hi' 64-bit fields individually.

This creates a potential race condition where the IOMMU hardware may fetch
a context entry while the CPU has only completed one of the two 64-bit
writes. This "torn" entry — consisting of half-old and half-new data —
could lead to unpredictable hardware behavior, especially when
transitioning the 'Present' bit or changing translation types.

To ensure the IOMMU hardware always observes a consistent state, use
128-bit atomic updates for context entries. This is achieved by building
context entries on the stack and writing them to the table in a single
operation.

As this relies on arch_cmpxchg128_local(), restrict INTEL_IOMMU
dependencies to X86_64.

Fixes: ba39592764ed2 ("Intel IOMMU: Intel IOMMU driver")
Reported-by: Dmytro Maluka <dmaluka@chromium.org>
Closes: https://lore.kernel.org/all/aTG7gc7I5wExai3S@google.com/
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/intel/Kconfig |  2 +-
 drivers/iommu/intel/iommu.h | 22 ++++++++++++++++++----
 drivers/iommu/intel/iommu.c | 30 +++++++++++++++---------------
 drivers/iommu/intel/pasid.c | 18 +++++++++---------
 4 files changed, 43 insertions(+), 29 deletions(-)

diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
index 5471f814e073..efda19820f95 100644
--- a/drivers/iommu/intel/Kconfig
+++ b/drivers/iommu/intel/Kconfig
@@ -11,7 +11,7 @@ config DMAR_DEBUG
 
 config INTEL_IOMMU
 	bool "Support for Intel IOMMU using DMA Remapping Devices"
-	depends on PCI_MSI && ACPI && X86
+	depends on PCI_MSI && ACPI && X86_64
 	select IOMMU_API
 	select GENERIC_PT
 	select IOMMU_PT
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 25c5e22096d4..b8999802f401 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -546,6 +546,16 @@ struct pasid_entry;
 struct pasid_state_entry;
 struct page_req_dsc;
 
+static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
+{
+	/*
+	 * Use the cmpxchg16b instruction for 128-bit atomicity. As updates
+	 * are serialized by a spinlock, we use the local (unlocked) variant
+	 * to avoid unnecessary bus locking overhead.
+	 */
+	arch_cmpxchg128_local(ptr, *ptr, val);
+}
+
 /*
  * 0: Present
  * 1-11: Reserved
@@ -569,8 +579,13 @@ struct root_entry {
  * 8-23: domain id
  */
 struct context_entry {
-	u64 lo;
-	u64 hi;
+	union {
+		struct {
+			u64 lo;
+			u64 hi;
+		};
+		u128 val128;
+	};
 };
 
 struct iommu_domain_info {
@@ -946,8 +961,7 @@ static inline int context_domain_id(struct context_entry *c)
 
 static inline void context_clear_entry(struct context_entry *context)
 {
-	context->lo = 0;
-	context->hi = 0;
+	intel_iommu_atomic128_set(&context->val128, 0);
 }
 
 #ifdef CONFIG_INTEL_IOMMU
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 134302fbcd92..d721061ebda2 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -1147,8 +1147,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 			domain_lookup_dev_info(domain, iommu, bus, devfn);
 	u16 did = domain_id_iommu(domain, iommu);
 	int translation = CONTEXT_TT_MULTI_LEVEL;
+	struct context_entry *context, new = {0};
 	struct pt_iommu_vtdss_hw_info pt_info;
-	struct context_entry *context;
 	int ret;
 
 	if (WARN_ON(!intel_domain_is_ss_paging(domain)))
@@ -1170,19 +1170,19 @@ static int domain_context_mapping_one(struct dmar_domain *domain,
 		goto out_unlock;
 
 	copied_context_tear_down(iommu, context, bus, devfn);
-	context_clear_entry(context);
-	context_set_domain_id(context, did);
+	context_set_domain_id(&new, did);
 
 	if (info && info->ats_supported)
 		translation = CONTEXT_TT_DEV_IOTLB;
 	else
 		translation = CONTEXT_TT_MULTI_LEVEL;
 
-	context_set_address_root(context, pt_info.ssptptr);
-	context_set_address_width(context, pt_info.aw);
-	context_set_translation_type(context, translation);
-	context_set_fault_enable(context);
-	context_set_present(context);
+	context_set_address_root(&new, pt_info.ssptptr);
+	context_set_address_width(&new, pt_info.aw);
+	context_set_translation_type(&new, translation);
+	context_set_fault_enable(&new);
+	context_set_present(&new);
+	intel_iommu_atomic128_set(&context->val128, new.val128);
 	if (!ecap_coherent(iommu->ecap))
 		clflush_cache_range(context, sizeof(*context));
 	context_present_cache_flush(iommu, did, bus, devfn);
@@ -3771,8 +3771,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
+	struct context_entry *context, new = {0};
 	struct intel_iommu *iommu = info->iommu;
-	struct context_entry *context;
 
 	spin_lock(&iommu->lock);
 	context = iommu_context_addr(iommu, bus, devfn, 1);
@@ -3787,17 +3787,17 @@ static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
 	}
 
 	copied_context_tear_down(iommu, context, bus, devfn);
-	context_clear_entry(context);
-	context_set_domain_id(context, FLPT_DEFAULT_DID);
+	context_set_domain_id(&new, FLPT_DEFAULT_DID);
 
 	/*
 	 * In pass through mode, AW must be programmed to indicate the largest
 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
 	 */
-	context_set_address_width(context, iommu->msagaw);
-	context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
-	context_set_fault_enable(context);
-	context_set_present(context);
+	context_set_address_width(&new, iommu->msagaw);
+	context_set_translation_type(&new, CONTEXT_TT_PASS_THROUGH);
+	context_set_fault_enable(&new);
+	context_set_present(&new);
+	intel_iommu_atomic128_set(&context->val128, new.val128);
 	if (!ecap_coherent(iommu->ecap))
 		clflush_cache_range(context, sizeof(*context));
 	context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index 3e2255057079..298a39183996 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -978,23 +978,23 @@ static int context_entry_set_pasid_table(struct context_entry *context,
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct pasid_table *table = info->pasid_table;
 	struct intel_iommu *iommu = info->iommu;
+	struct context_entry new = {0};
 	unsigned long pds;
 
-	context_clear_entry(context);
-
 	pds = context_get_sm_pds(table);
-	context->lo = (u64)virt_to_phys(table->table) | context_pdts(pds);
-	context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
+	new.lo = (u64)virt_to_phys(table->table) | context_pdts(pds);
+	context_set_sm_rid2pasid(&new, IOMMU_NO_PASID);
 
 	if (info->ats_supported)
-		context_set_sm_dte(context);
+		context_set_sm_dte(&new);
 	if (info->pasid_supported)
-		context_set_pasid(context);
+		context_set_pasid(&new);
 	if (info->pri_supported)
-		context_set_sm_pre(context);
+		context_set_sm_pre(&new);
 
-	context_set_fault_enable(context);
-	context_set_present(context);
+	context_set_fault_enable(&new);
+	context_set_present(&new);
+	intel_iommu_atomic128_set(&context->val128, new.val128);
 	__iommu_flush_cache(iommu, context, sizeof(*context));
 
 	return 0;
-- 
2.43.0

RE: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Tian, Kevin 3 weeks, 3 days ago
> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Tuesday, January 13, 2026 11:01 AM
> 
> On Intel IOMMU, device context entries are accessed by hardware in
> 128-bit chunks. Currently, the driver updates these entries by
> programming the 'lo' and 'hi' 64-bit fields individually.
> 
> This creates a potential race condition where the IOMMU hardware may
> fetch
> a context entry while the CPU has only completed one of the two 64-bit
> writes. This "torn" entry — consisting of half-old and half-new data —
> could lead to unpredictable hardware behavior, especially when
> transitioning the 'Present' bit or changing translation types.

this is not accurate. context entry is 128bits only. Scalable context
entry is 256bits but only the lower 128bits are defined. so hardware always
fetches the context entry atomically. Then if software ensures the right
order of updates (clear present first then other bits), the hardware won't
look at the partial entry after seeing present=0.

But now as Dmytro reported there is no barrier in place so two 64bits
updates to the context entry might be reordered so hw could fetch
an entry with old lower half (present=1) and new higher half.

then 128bit atomic operation avoids this ordering concern.

> @@ -1170,19 +1170,19 @@ static int domain_context_mapping_one(struct
> dmar_domain *domain,
>  		goto out_unlock;
> 
>  	copied_context_tear_down(iommu, context, bus, devfn);
> -	context_clear_entry(context);
> -	context_set_domain_id(context, did);
> +	context_set_domain_id(&new, did);

I wonder whether it's necessary to use atomic in the attach path, from
fix p.o.v.

The assumption is that the context should have been cleared already
before calling this function (and following ones). Does it make more
sense to check the present bit, warning if set, then fail the operation?

We could refactor them to do atomic update, but then it's for cleanup
instead of being part of a fix.

Then this may be split into three patches:

- change context_clear_entry() to be atomic, to fix the teardown path
- add present bit check in other functions in this patch, to scrutinize the
  attach path
- change those functions to be atomic, as a clean up

Does it make sense?
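
For the present bit check in the 2nd step, a rough sketch (untested)
could be something like:

	/* e.g. in domain_context_mapping_one(), before programming 'new' */
	if (WARN_ON(context_present(context))) {
		ret = -EBUSY;
		goto out_unlock;
	}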
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Baolu Lu 3 weeks, 2 days ago
On 1/14/26 15:54, Tian, Kevin wrote:
>> From: Lu Baolu <baolu.lu@linux.intel.com>
>> Sent: Tuesday, January 13, 2026 11:01 AM
>>
>> On Intel IOMMU, device context entries are accessed by hardware in
>> 128-bit chunks. Currently, the driver updates these entries by
>> programming the 'lo' and 'hi' 64-bit fields individually.
>>
>> This creates a potential race condition where the IOMMU hardware may
>> fetch
>> a context entry while the CPU has only completed one of the two 64-bit
>> writes. This "torn" entry — consisting of half-old and half-new data —
>> could lead to unpredictable hardware behavior, especially when
>> transitioning the 'Present' bit or changing translation types.
> 
> this is not accurate. context entry is 128bits only. Scalable context
> entry is 256bits but only the lower 128bits are defined. so hardware always
> fetches the context entry atomically. Then if software ensures the right
> order of updates (clear present first then other bits), the hardware won't
> look at the partial entry after seeing present=0.
> 
> But now as Dmytro reported there is no barrier in place so two 64bits
> updates to the context entry might be reordered so hw could fetch
> an entry with old lower half (present=1) and new higher half.
> 
> then 128bit atomic operation avoids this ordering concern.

You're right. I will update the commit message to be more precise. Since
the hardware fetches the 128-bit context entry atomically, the issue is
essentially a software ordering problem.

We considered three approaches to solve this:

- Memory barriers (to enforce Present bit clearing order)
- WRITE_ONCE() (to prevent compiler reordering)
- 128-bit atomic updates

This patch uses the atomic update approach.
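
For reference, the ordering-based alternative we considered would
roughly look like below (untested sketch, helper name made up): publish
the high qword first, and make the low qword, which carries the Present
bit, visible last.

static void context_entry_publish(struct context_entry *context,
				  const struct context_entry *new)
{
	/*
	 * WRITE_ONCE() keeps the compiler from reordering or tearing
	 * the two stores; on x86 they also become globally visible in
	 * program order, so the IOMMU never observes a present entry
	 * paired with a stale high qword.
	 */
	WRITE_ONCE(context->hi, new->hi);
	WRITE_ONCE(context->lo, new->lo);
}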

> 
>> @@ -1170,19 +1170,19 @@ static int domain_context_mapping_one(struct
>> dmar_domain *domain,
>>   		goto out_unlock;
>>
>>   	copied_context_tear_down(iommu, context, bus, devfn);
>> -	context_clear_entry(context);
>> -	context_set_domain_id(context, did);
>> +	context_set_domain_id(&new, did);
> 
> I wonder whether it's necessary to use atomic in the attach path, from
> fix p.o.v.
> 
> The assumption is that the context should have been cleared already
> before calling this function (and following ones). Does it make more
> sense to check the present bit, warning if set, then fail the operation?
> We could refactor them to do atomic update, but then it's for cleanup
> instead of being part of a fix.

Yes. For the attach path, this is a cleanup rather than a fix.

> 
> Then this may be split into three patches:
> 
> - change context_clear_entry() to be atomic, to fix the teardown path
> - add present bit check in other functions in this patch, to scrutinize the
>    attach path
> - change those functions to be atomic, as a clean up

Perhaps this also paves the way for enabling hitless replace in the
attach_dev path?

> Does it make sense?

Yes, it does.

Thanks,
baolu
RE: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Tian, Kevin 3 weeks, 2 days ago
> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Thursday, January 15, 2026 11:27 AM
> 
> On 1/14/26 15:54, Tian, Kevin wrote:
> >
> > Then this may be split into three patches:
> >
> > - change context_clear_entry() to be atomic, to fix the teardown path
> > - add present bit check in other functions in this patch, to scrutinize the
> >    attach path
> > - change those functions to be atomic, as a clean up
> 
> Perhaps this also paves the way for enabling hitless replace in the
> attach_dev path?
> 

I didn't get it. attach/replace are different paths and iommu core will
reject an attach request for a device which is already attached...
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Jason Gunthorpe 3 weeks, 2 days ago
On Thu, Jan 15, 2026 at 05:59:56AM +0000, Tian, Kevin wrote:
> > From: Baolu Lu <baolu.lu@linux.intel.com>
> > Sent: Thursday, January 15, 2026 11:27 AM
> > 
> > On 1/14/26 15:54, Tian, Kevin wrote:
> > >
> > > Then this may be split into three patches:
> > >
> > > - change context_clear_entry() to be atomic, to fix the teardown path
> > > - add present bit check in other functions in this patch, to scrutinize the
> > >    attach path
> > > - change those functions to be atomic, as a clean up
> > 
> > Perhaps this also paves the way for enabling hitless replace in the
> > attach_dev path?
> > 
> 
> I didn't get it. attach/replace are different paths and iommu core will
> reject an attach request for a device which is already attached...

From a driver perspective there should be no such thing as
replace. vt-d has gone wrong by having special replace flows inside
itself.

Drivers do attach and should try to make it hitless, that's it.

Meaning drivers do the whole attach sequence in a robust way:
 - Manage invalidation lists so that old/new domains continue to
   generate invalidations while transitioning
 - Updating HW data structures without making them non-present
 - Managing ATS enable/disable in the right order

It is a lot to think about, but if you follow the ARM sequence it is
all written out there..

So this series should be about the 2nd one, making HW updates go from
something on the stack to something in the HW, and if Balou is going
to use entry_set then use entry_set for *everything* and it will deal
with things like using the 128 bit store or making things non-present
temporarily. Even if the thing has only 128 bits used you may as well
stick with it.

Remember also that the 128 bit store is a CPU optional feature. AMD
investigated and found all their CPUs with IOMMUs have the store
feature so they could use it unconditionally (IIRC they check at
driver probe and fail probe without 128 bit store). VTD needs to do
the same thing, and if 128 bit store is actually sometimes not
available it needs to fall back to the 64 bit entry set, for
everything.
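
Something like this, roughly (untested, names made up):

static bool context_have_cx16;

static void intel_iommu_detect_cx16(void)
{
	/* CMPXCHG16B is an optional CPU feature; detect it once at probe. */
	context_have_cx16 = boot_cpu_has(X86_FEATURE_CX16);
}

static void context_entry_write(struct context_entry *dst,
				const struct context_entry *src)
{
	if (context_have_cx16) {
		intel_iommu_atomic128_set(&dst->val128, src->val128);
		return;
	}
	/* Fallback: ordered 64-bit stores, Present bit published last. */
	WRITE_ONCE(dst->hi, src->hi);
	WRITE_ONCE(dst->lo, src->lo);
}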

Jason
RE: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Tian, Kevin 3 weeks, 1 day ago
> From: Jason Gunthorpe <jgg@nvidia.com>
> Sent: Thursday, January 15, 2026 9:23 PM
> 
> On Thu, Jan 15, 2026 at 05:59:56AM +0000, Tian, Kevin wrote:
> > > From: Baolu Lu <baolu.lu@linux.intel.com>
> > > Sent: Thursday, January 15, 2026 11:27 AM
> > >
> > > On 1/14/26 15:54, Tian, Kevin wrote:
> > > >
> > > > Then this may be split into three patches:
> > > >
> > > > - change context_clear_entry() to be atomic, to fix the teardown path
> > > > - add present bit check in other functions in this patch, to scrutinize the
> > > >    attach path
> > > > - change those functions to be atomic, as a clean up
> > >
> > > Perhaps this also paves the way for enabling hitless replace in the
> > > attach_dev path?
> > >
> >
> > I didn't get it. attach/replace are different paths and iommu core will
> > reject an attach request for a device which is already attached...
> 
> From a driver perspective there should be no such thing as
> replace. vt-d has gone wrong by having special replace flows inside
> itself.
> 
> Drivers do attach and should try to make it hitless, that's it.

Agree

> 
> Meaning drivers do the whole attach sequence in a robust way:
>  - Manage invalidation lists so that old/new domains continue to
>    generate invalidations while transitioning
>  - Updating HW data structures without making them non-present
>  - Managing ATS enable/disable in the right order
> 
> It is a lot to think about, but if you follow the ARM sequence it is
> all written out there..
> 
> So this series should be about the 2nd one, making HW updates go from
> something on the stack to something in the HW, and if Baolu is going
> to use entry_set then use entry_set for *everything* and it will deal
> with things like using the 128 bit store or making things non-present
> temporarily. Even if the thing has only 128 bits used you may as well
> stick with it.
> 
> Remember also that the 128 bit store is a CPU optional feature. AMD
> investigated and found all their CPUs with IOMMUs have the store
> feature so they could use it unconditionally (IIRC they check at
> driver probe and fail probe without 128 bit store). VTD needs to do
> the same thing, and if 128 bit store is actually sometimes not
> available it needs to fall back to the 64 bit entry set, for
> everything.
> 

We are checking that... 

btw the way that the VT-d spec describes it as "software MUST use
128bit..." highly suggests that it's either as AMD or with exceptions
only on very early generations. Another exception is a VM which
may have hardware supporting 128bit but whose vCPU model doesn't,
but I doubt it's a meaningful configuration nowadays...
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Jason Gunthorpe 3 weeks, 1 day ago
On Fri, Jan 16, 2026 at 05:19:52AM +0000, Tian, Kevin wrote:
> btw the way that VT-d spec describes it as "software MUST use
> 128bit..." highly suggest that it's either as AMD or with exception
> only on very early generations. Another exception is VM which
> may have hardware supporting 128bit but vcpu model doesn't,
> but I doubt it's a meaningful configuration nowadays...

The VM case is an interesting point..

I think with a bit of fiddling the entry_set mechanism could fall back
to 64 bit operation automatically, and that would handle the
compatibility needs. It would be much less hitless in such systems but
who cares?

Jason
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Dmytro Maluka 3 weeks, 3 days ago
On Tue, Jan 13, 2026 at 11:00:46AM +0800, Lu Baolu wrote:
> On Intel IOMMU, device context entries are accessed by hardware in
> 128-bit chunks. Currently, the driver updates these entries by
> programming the 'lo' and 'hi' 64-bit fields individually.
> 
> This creates a potential race condition where the IOMMU hardware may fetch
> a context entry while the CPU has only completed one of the two 64-bit
> writes. This "torn" entry — consisting of half-old and half-new data —
> could lead to unpredictable hardware behavior, especially when
> transitioning the 'Present' bit or changing translation types.
> 
> To ensure the IOMMU hardware always observes a consistent state, use
> 128-bit atomic updates for context entries. This is achieved by building
> context entries on the stack and writing them to the table in a single
> operation.
> 
> As this relies on arch_cmpxchg128_local(), restrict INTEL_IOMMU
> dependencies to X86_64.
> 
> Fixes: ba39592764ed2 ("Intel IOMMU: Intel IOMMU driver")
> Reported-by: Dmytro Maluka <dmaluka@chromium.org>

FWIW, Jason and Kevin contributed to this discovery more than I did. :)

> Closes: https://lore.kernel.org/all/aTG7gc7I5wExai3S@google.com/
> Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
> ---
>  drivers/iommu/intel/Kconfig |  2 +-
>  drivers/iommu/intel/iommu.h | 22 ++++++++++++++++++----
>  drivers/iommu/intel/iommu.c | 30 +++++++++++++++---------------
>  drivers/iommu/intel/pasid.c | 18 +++++++++---------
>  4 files changed, 43 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig
> index 5471f814e073..efda19820f95 100644
> --- a/drivers/iommu/intel/Kconfig
> +++ b/drivers/iommu/intel/Kconfig
> @@ -11,7 +11,7 @@ config DMAR_DEBUG
>  
>  config INTEL_IOMMU
>  	bool "Support for Intel IOMMU using DMA Remapping Devices"
> -	depends on PCI_MSI && ACPI && X86
> +	depends on PCI_MSI && ACPI && X86_64
>  	select IOMMU_API
>  	select GENERIC_PT
>  	select IOMMU_PT
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index 25c5e22096d4..b8999802f401 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -546,6 +546,16 @@ struct pasid_entry;
>  struct pasid_state_entry;
>  struct page_req_dsc;
>  
> +static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
> +{
> +	/*
> +	 * Use the cmpxchg16b instruction for 128-bit atomicity. As updates
> +	 * are serialized by a spinlock, we use the local (unlocked) variant
> +	 * to avoid unnecessary bus locking overhead.
> +	 */
> +	arch_cmpxchg128_local(ptr, *ptr, val);

Any reason why not cmpxchg128_local()? (except following the AMD driver)

Otherwise,

Reviewed-by: Dmytro Maluka <dmaluka@chromium.org>

Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Baolu Lu 3 weeks, 3 days ago
On 1/14/26 03:27, Dmytro Maluka wrote:
> On Tue, Jan 13, 2026 at 11:00:46AM +0800, Lu Baolu wrote:
>> On Intel IOMMU, device context entries are accessed by hardware in
>> 128-bit chunks. Currently, the driver updates these entries by
>> programming the 'lo' and 'hi' 64-bit fields individually.
>>
>> This creates a potential race condition where the IOMMU hardware may fetch
>> a context entry while the CPU has only completed one of the two 64-bit
>> writes. This "torn" entry — consisting of half-old and half-new data —
>> could lead to unpredictable hardware behavior, especially when
>> transitioning the 'Present' bit or changing translation types.
>>
>> To ensure the IOMMU hardware always observes a consistent state, use
>> 128-bit atomic updates for context entries. This is achieved by building
>> context entries on the stack and writing them to the table in a single
>> operation.
>>
>> As this relies on arch_cmpxchg128_local(), restrict INTEL_IOMMU
>> dependencies to X86_64.
>>
>> Fixes: ba39592764ed2 ("Intel IOMMU: Intel IOMMU driver")
>> Reported-by: Dmytro Maluka <dmaluka@chromium.org>
> FWIW, Jason and Kevin contributed to this discovery more than I did. 🙂

Thanks to all you guys.

> 
>> @@ -546,6 +546,16 @@ struct pasid_entry;
>>   struct pasid_state_entry;
>>   struct page_req_dsc;
>>   
>> +static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
>> +{
>> +	/*
>> +	 * Use the cmpxchg16b instruction for 128-bit atomicity. As updates
>> +	 * are serialized by a spinlock, we use the local (unlocked) variant
>> +	 * to avoid unnecessary bus locking overhead.
>> +	 */
>> +	arch_cmpxchg128_local(ptr, *ptr, val);
> Any reason why not cmpxchg128_local()? (except following the AMD driver)

Yes. This follows the AMD IOMMU driver. Both drivers use spin lock to
synchronize the update of table entries. They only need the atomicity of
the 128-bit instruction itself. So arch_cmpxchg128_local() works.

> 
> Otherwise,
> 
> Reviewed-by: Dmytro Maluka <dmaluka@chromium.org>

Thanks,
baolu
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Dmytro Maluka 3 weeks, 3 days ago
On Wed, Jan 14, 2026 at 01:14:36PM +0800, Baolu Lu wrote:
> On 1/14/26 03:27, Dmytro Maluka wrote:
> > On Tue, Jan 13, 2026 at 11:00:46AM +0800, Lu Baolu wrote:
> > > +static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
> > > +{
> > > +	/*
> > > +	 * Use the cmpxchg16b instruction for 128-bit atomicity. As updates
> > > +	 * are serialized by a spinlock, we use the local (unlocked) variant
> > > +	 * to avoid unnecessary bus locking overhead.
> > > +	 */
> > > +	arch_cmpxchg128_local(ptr, *ptr, val);
> > Any reason why not cmpxchg128_local()? (except following the AMD driver)
> 
> Yes. This follows the AMD IOMMU driver. Both drivers use spin lock to
> synchronize the update of table entries. They only need the atomicity of
> the 128-bit instruction itself. So arch_cmpxchg128_local() works.

Yeah, but my question was merely: why use the raw arch_*() version, not
cmpxchg128_local() which is the same but also includes optional
kasan/kcsan instrumentation:

#define cmpxchg128_local(ptr, ...) \
({ \
	typeof(ptr) __ai_ptr = (ptr); \
	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
	raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
})

IOW, why bypass this instrumentation?
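
I.e. presumably the helper could simply be (untested):

static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
{
	/* Same semantics, but keeps the optional KASAN/KCSAN instrumentation. */
	cmpxchg128_local(ptr, *ptr, val);
}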
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Baolu Lu 3 weeks, 2 days ago
On 1/14/26 18:55, Dmytro Maluka wrote:
> On Wed, Jan 14, 2026 at 01:14:36PM +0800, Baolu Lu wrote:
>> On 1/14/26 03:27, Dmytro Maluka wrote:
>>> On Tue, Jan 13, 2026 at 11:00:46AM +0800, Lu Baolu wrote:
>>>> +static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
>>>> +{
>>>> +	/*
>>>> +	 * Use the cmpxchg16b instruction for 128-bit atomicity. As updates
>>>> +	 * are serialized by a spinlock, we use the local (unlocked) variant
>>>> +	 * to avoid unnecessary bus locking overhead.
>>>> +	 */
>>>> +	arch_cmpxchg128_local(ptr, *ptr, val);
>>> Any reason why not cmpxchg128_local()? (except following the AMD driver)
>>
>> Yes. This follows the AMD IOMMU driver. Both drivers use spin lock to
>> synchronize the update of table entries. They only need the atomicity of
>> the 128-bit instruction itself. So arch_cmpxchg128_local() works.
> 
> Yeah, but my question was merely: why use the raw arch_*() version, not
> cmpxchg128_local() which is the same but also includes optional
> kasan/kcsan instrumentation:
> 
> #define cmpxchg128_local(ptr, ...) \
> ({ \
> 	typeof(ptr) __ai_ptr = (ptr); \
> 	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
> 	raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
> })
> 
> IOW, why bypass this instrumentation?

You are right. There is no strong technical reason to bypass the kasan/
kcsan instrumentation here. My use of the arch_ version was primarily
following the existing pattern in the AMD driver, likely under the
assumption that the spinlock provided sufficient synchronization.

That said, Jason has suggested the generic entry_sync library to handle
these types of multi-quanta updates across different IOMMU drivers. I
plan to adopt that in the next version.

Thanks,
baolu
Re: [PATCH 1/3] iommu/vt-d: Use 128-bit atomic updates for context entries
Posted by Jason Gunthorpe 3 weeks, 2 days ago
On Thu, Jan 15, 2026 at 10:26:16AM +0800, Baolu Lu wrote:
> On 1/14/26 18:55, Dmytro Maluka wrote:
> > On Wed, Jan 14, 2026 at 01:14:36PM +0800, Baolu Lu wrote:
> > > On 1/14/26 03:27, Dmytro Maluka wrote:
> > > > On Tue, Jan 13, 2026 at 11:00:46AM +0800, Lu Baolu wrote:
> > > > > +static __always_inline void intel_iommu_atomic128_set(u128 *ptr, u128 val)
> > > > > +{
> > > > > +	/*
> > > > > +	 * Use the cmpxchg16b instruction for 128-bit atomicity. As updates
> > > > > +	 * are serialized by a spinlock, we use the local (unlocked) variant
> > > > > +	 * to avoid unnecessary bus locking overhead.
> > > > > +	 */
> > > > > +	arch_cmpxchg128_local(ptr, *ptr, val);
> > > > Any reason why not cmpxchg128_local()? (except following the AMD driver)
> > > 
> > > Yes. This follows the AMD IOMMU driver. Both drivers use spin lock to
> > > synchronize the update of table entries. They only need the atomicity of
> > > the 128-bit instruction itself. So arch_cmpxchg128_local() works.
> > 
> > Yeah, but my question was merely: why use the raw arch_*() version, not
> > cmpxchg128_local() which is the same but also includes optional
> > kasan/kcsan instrumentation:
> > 
> > #define cmpxchg128_local(ptr, ...) \
> > ({ \
> > 	typeof(ptr) __ai_ptr = (ptr); \
> > 	instrument_atomic_read_write(__ai_ptr, sizeof(*__ai_ptr)); \
> > 	raw_cmpxchg128_local(__ai_ptr, __VA_ARGS__); \
> > })
> > 
> > IOW, why bypass this instrumentation?
> 
> You are right. There is no strong technical reason to bypass the kasan/
> kcsan instrumentation here. My use of the arch_ version was primarily
> following the existing pattern in the AMD driver, likely under the
> assumption that the spinlock provided sufficient synchronization.
> 
> That said, Jason has suggested the generic entry_sync library to handle
> these types of multi-quanta updates across different IOMMU drivers. I
> plan to adopt that in the next version.

I also copied the AMD driver in my draft, so it should be changed to
this as well, I think?

Jason