Add several helpers to invalidate the caches after mappings in the
affected domain are changed.
- cache_tag_flush_range() invalidates a range of caches after mappings
within this range are changed. It uses the page-selective cache
invalidation methods.
- cache_tag_flush_all() invalidates all caches tagged by a domain ID.
It uses the domain-selective cache invalidation methods.
- cache_tag_flush_range_np() invalidates a range of caches when new
mappings are created in the domain and the corresponding page table
entries change from non-present to present.
Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
drivers/iommu/intel/iommu.h | 14 +++
drivers/iommu/intel/cache.c | 195 ++++++++++++++++++++++++++++++++++++
drivers/iommu/intel/iommu.c | 12 ---
3 files changed, 209 insertions(+), 12 deletions(-)
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index 52471f5337d5..e17683ecef4b 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -35,6 +35,8 @@
#define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT)
#define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
+#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
+
#define VTD_STRIDE_SHIFT (9)
#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT)
@@ -1041,6 +1043,13 @@ static inline void context_set_sm_pre(struct context_entry *context)
context->lo |= BIT_ULL(4);
}
+/* Returns a number of VTD pages, but aligned to MM page size */
+static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
+{
+ host_addr &= ~PAGE_MASK;
+ return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
+}
+
/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds) (((pds) & 0x7) << 9)
@@ -1122,6 +1131,11 @@ int cache_tag_assign_domain(struct dmar_domain *domain,
struct device *dev, ioasid_t pasid);
void cache_tag_unassign_domain(struct dmar_domain *domain,
struct device *dev, ioasid_t pasid);
+void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
+ unsigned long end, int ih);
+void cache_tag_flush_all(struct dmar_domain *domain);
+void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start,
+ unsigned long end);
#ifdef CONFIG_INTEL_IOMMU_SVM
void intel_svm_check(struct intel_iommu *iommu);
diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c
index 296f1645a739..0539275a9d20 100644
--- a/drivers/iommu/intel/cache.c
+++ b/drivers/iommu/intel/cache.c
@@ -12,6 +12,7 @@
#include <linux/dmar.h>
#include <linux/iommu.h>
#include <linux/memory.h>
+#include <linux/pci.h>
#include <linux/spinlock.h>
#include "iommu.h"
@@ -212,3 +213,197 @@ void cache_tag_unassign_domain(struct dmar_domain *domain,
if (domain->domain.type == IOMMU_DOMAIN_NESTED)
__cache_tag_unassign_parent_domain(domain->s2_domain, did, dev, pasid);
}
+
+static unsigned long calculate_psi_aligned_address(unsigned long start,
+ unsigned long end,
+ unsigned long *_pages,
+ unsigned long *_mask)
+{
+ unsigned long pages = aligned_nrpages(start, end - start + 1);
+ unsigned long aligned_pages = __roundup_pow_of_two(pages);
+ unsigned long bitmask = aligned_pages - 1;
+ unsigned long mask = ilog2(aligned_pages);
+ unsigned long pfn = IOVA_PFN(start);
+
+ /*
+ * PSI masks the low order bits of the base address. If the
+ * address isn't aligned to the mask, then compute a mask value
+ * needed to ensure the target range is flushed.
+ */
+ if (unlikely(bitmask & pfn)) {
+ unsigned long end_pfn = pfn + pages - 1, shared_bits;
+
+ /*
+ * Since end_pfn <= pfn + bitmask, the only way bits
+ * higher than bitmask can differ in pfn and end_pfn is
+ * by carrying. This means after masking out bitmask,
+ * high bits starting with the first set bit in
+ * shared_bits are all equal in both pfn and end_pfn.
+ */
+ shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
+ mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
+ }
+
+ *_pages = aligned_pages;
+ *_mask = mask;
+
+ return ALIGN_DOWN(start, VTD_PAGE_SIZE << mask);
+}
+
+/*
+ * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive)
+ * when the memory mappings in the target domain have been modified.
+ */
+void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
+ unsigned long end, int ih)
+{
+ unsigned long pages, mask, addr;
+ struct cache_tag *tag;
+ unsigned long flags;
+
+ addr = calculate_psi_aligned_address(start, end, &pages, &mask);
+
+ spin_lock_irqsave(&domain->cache_lock, flags);
+ list_for_each_entry(tag, &domain->cache_tags, node) {
+ struct intel_iommu *iommu = tag->iommu;
+ struct device_domain_info *info;
+ u16 sid;
+
+ switch (tag->type) {
+ case CACHE_TAG_IOTLB:
+ case CACHE_TAG_NESTING_IOTLB:
+ if (domain->use_first_level) {
+ qi_flush_piotlb(iommu, tag->domain_id,
+ tag->pasid, addr, pages, ih);
+ } else {
+ /*
+ * Fallback to domain selective flush if no
+ * PSI support or the size is too big.
+ */
+ if (!cap_pgsel_inv(iommu->cap) ||
+ mask > cap_max_amask_val(iommu->cap))
+ iommu->flush.flush_iotlb(iommu, tag->domain_id,
+ 0, 0, DMA_TLB_DSI_FLUSH);
+ else
+ iommu->flush.flush_iotlb(iommu, tag->domain_id,
+ addr | ih, mask,
+ DMA_TLB_PSI_FLUSH);
+ }
+ break;
+ case CACHE_TAG_NESTING_DEVTLB:
+ /*
+ * Address translation cache in device side caches the
+ * result of nested translation. There is no easy way
+ * to identify the exact set of nested translations
+ * affected by a change in S2. So just flush the entire
+ * device cache.
+ */
+ addr = 0;
+ mask = MAX_AGAW_PFN_WIDTH;
+ fallthrough;
+ case CACHE_TAG_DEVTLB:
+ info = dev_iommu_priv_get(tag->dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+
+ if (tag->pasid == IOMMU_NO_PASID)
+ qi_flush_dev_iotlb(iommu, sid, info->pfsid,
+ info->ats_qdep, addr, mask);
+ else
+ qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
+ tag->pasid, info->ats_qdep,
+ addr, mask);
+
+ quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&domain->cache_lock, flags);
+}
+
+/*
+ * Invalidates all ranges of IOVA when the memory mappings in the target
+ * domain have been modified.
+ */
+void cache_tag_flush_all(struct dmar_domain *domain)
+{
+ struct cache_tag *tag;
+ unsigned long flags;
+
+ spin_lock_irqsave(&domain->cache_lock, flags);
+ list_for_each_entry(tag, &domain->cache_tags, node) {
+ struct intel_iommu *iommu = tag->iommu;
+ struct device_domain_info *info;
+ u16 sid;
+
+ switch (tag->type) {
+ case CACHE_TAG_IOTLB:
+ case CACHE_TAG_NESTING_IOTLB:
+ if (domain->use_first_level)
+ qi_flush_piotlb(iommu, tag->domain_id,
+ tag->pasid, 0, -1, 0);
+ else
+ iommu->flush.flush_iotlb(iommu, tag->domain_id,
+ 0, 0, DMA_TLB_DSI_FLUSH);
+ break;
+ case CACHE_TAG_DEVTLB:
+ case CACHE_TAG_NESTING_DEVTLB:
+ info = dev_iommu_priv_get(tag->dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+
+ qi_flush_dev_iotlb(iommu, sid, info->pfsid, info->ats_qdep,
+ 0, MAX_AGAW_PFN_WIDTH);
+ quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH,
+ IOMMU_NO_PASID, info->ats_qdep);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&domain->cache_lock, flags);
+}
+
+/*
+ * Invalidate a range of IOVA when new mappings are created in the target
+ * domain.
+ *
+ * - VT-d spec, Section 6.1 Caching Mode: When the CM field is reported as
+ * Set, any software updates to remapping structures other than first-
+ * stage mapping requires explicit invalidation of the caches.
+ * - VT-d spec, Section 6.8 Write Buffer Flushing: For hardware that requires
+ * write buffer flushing, software must explicitly perform write-buffer
+ * flushing, if cache invalidation is not required.
+ */
+void cache_tag_flush_range_np(struct dmar_domain *domain, unsigned long start,
+ unsigned long end)
+{
+ unsigned long pages, mask, addr;
+ struct cache_tag *tag;
+ unsigned long flags;
+
+ addr = calculate_psi_aligned_address(start, end, &pages, &mask);
+
+ spin_lock_irqsave(&domain->cache_lock, flags);
+ list_for_each_entry(tag, &domain->cache_tags, node) {
+ struct intel_iommu *iommu = tag->iommu;
+
+ if (!cap_caching_mode(iommu->cap) || domain->use_first_level) {
+ iommu_flush_write_buffer(iommu);
+ continue;
+ }
+
+ if (tag->type == CACHE_TAG_IOTLB ||
+ tag->type == CACHE_TAG_NESTING_IOTLB) {
+ /*
+ * Fallback to domain selective flush if no
+ * PSI support or the size is too big.
+ */
+ if (!cap_pgsel_inv(iommu->cap) ||
+ mask > cap_max_amask_val(iommu->cap))
+ iommu->flush.flush_iotlb(iommu, tag->domain_id,
+ 0, 0, DMA_TLB_DSI_FLUSH);
+ else
+ iommu->flush.flush_iotlb(iommu, tag->domain_id,
+ addr, mask,
+ DMA_TLB_PSI_FLUSH);
+ }
+ }
+ spin_unlock_irqrestore(&domain->cache_lock, flags);
+}
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 0c0b8e493fda..ac413097058c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -54,11 +54,6 @@
__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
-/* IO virtual address start page frame number */
-#define IOVA_START_PFN (1)
-
-#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
-
static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;
@@ -1991,13 +1986,6 @@ domain_context_mapping(struct dmar_domain *domain, struct device *dev)
domain_context_mapping_cb, domain);
}
-/* Returns a number of VTD pages, but aligned to MM page size */
-static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
-{
- host_addr &= ~PAGE_MASK;
- return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
-}
-
/* Return largest possible superpage level for a given mapping */
static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
unsigned long phy_pfn, unsigned long pages)
--
2.34.1
On 4/16/24 4:06 PM, Lu Baolu wrote:
> Add several helpers to invalidate the caches after mappings in the
> affected domain are changed.
>
> - cache_tag_flush_range() invalidates a range of caches after mappings
> within this range are changed. It uses the page-selective cache
> invalidation methods.
>
> - cache_tag_flush_all() invalidates all caches tagged by a domain ID.
> It uses the domain-selective cache invalidation methods.
>
> - cache_tag_flush_range_np() invalidates a range of caches when new
> mappings are created in the domain and the corresponding page table
> entries change from non-present to present.
>
> Signed-off-by: Lu Baolu<baolu.lu@linux.intel.com>
> ---
> drivers/iommu/intel/iommu.h | 14 +++
> drivers/iommu/intel/cache.c | 195 ++++++++++++++++++++++++++++++++++++
> drivers/iommu/intel/iommu.c | 12 ---
> 3 files changed, 209 insertions(+), 12 deletions(-)
[...]
> +
> +/*
> + * Invalidates a range of IOVA from @start (inclusive) to @end (inclusive)
> + * when the memory mappings in the target domain have been modified.
> + */
> +void cache_tag_flush_range(struct dmar_domain *domain, unsigned long start,
> + unsigned long end, int ih)
> +{
> + unsigned long pages, mask, addr;
> + struct cache_tag *tag;
> + unsigned long flags;
> +
> + addr = calculate_psi_aligned_address(start, end, &pages, &mask);
> +
> + spin_lock_irqsave(&domain->cache_lock, flags);
> + list_for_each_entry(tag, &domain->cache_tags, node) {
> + struct intel_iommu *iommu = tag->iommu;
> + struct device_domain_info *info;
> + u16 sid;
> +
> + switch (tag->type) {
> + case CACHE_TAG_IOTLB:
> + case CACHE_TAG_NESTING_IOTLB:
> + if (domain->use_first_level) {
> + qi_flush_piotlb(iommu, tag->domain_id,
> + tag->pasid, addr, pages, ih);
> + } else {
> + /*
> + * Fallback to domain selective flush if no
> + * PSI support or the size is too big.
> + */
> + if (!cap_pgsel_inv(iommu->cap) ||
> + mask > cap_max_amask_val(iommu->cap))
> + iommu->flush.flush_iotlb(iommu, tag->domain_id,
> + 0, 0, DMA_TLB_DSI_FLUSH);
> + else
> + iommu->flush.flush_iotlb(iommu, tag->domain_id,
> + addr | ih, mask,
> + DMA_TLB_PSI_FLUSH);
> + }
> + break;
> + case CACHE_TAG_NESTING_DEVTLB:
> + /*
> + * Address translation cache in device side caches the
> + * result of nested translation. There is no easy way
> + * to identify the exact set of nested translations
> + * affected by a change in S2. So just flush the entire
> + * device cache.
> + */
> + addr = 0;
> + mask = MAX_AGAW_PFN_WIDTH;
> + fallthrough;
I realized that the logic above is not right. Setting both @addr and
@mask to 0 doesn't mean flushing all caches on the device. I will change
it like below:
diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c
index e8418cdd8331..18debb82272a 100644
--- a/drivers/iommu/intel/cache.c
+++ b/drivers/iommu/intel/cache.c
@@ -302,9 +302,14 @@ void cache_tag_flush_range(struct dmar_domain
*domain, unsigned long start,
* affected by a change in S2. So just flush
the entire
* device cache.
*/
- addr = 0;
- mask = MAX_AGAW_PFN_WIDTH;
- fallthrough;
+ info = dev_iommu_priv_get(tag->dev);
+ sid = PCI_DEVID(info->bus, info->devfn);
+
+ qi_flush_dev_iotlb(iommu, sid, info->pfsid,
info->ats_qdep,
+ 0, MAX_AGAW_PFN_WIDTH);
+ quirk_extra_dev_tlb_flush(info, 0,
MAX_AGAW_PFN_WIDTH,
+ IOMMU_NO_PASID,
info->ats_qdep);
+ break;
case CACHE_TAG_DEVTLB:
info = dev_iommu_priv_get(tag->dev);
sid = PCI_DEVID(info->bus, info->devfn);
> + case CACHE_TAG_DEVTLB:
> + info = dev_iommu_priv_get(tag->dev);
> + sid = PCI_DEVID(info->bus, info->devfn);
> +
> + if (tag->pasid == IOMMU_NO_PASID)
> + qi_flush_dev_iotlb(iommu, sid, info->pfsid,
> + info->ats_qdep, addr, mask);
> + else
> + qi_flush_dev_iotlb_pasid(iommu, sid, info->pfsid,
> + tag->pasid, info->ats_qdep,
> + addr, mask);
> +
> + quirk_extra_dev_tlb_flush(info, addr, mask, tag->pasid, info->ats_qdep);
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&domain->cache_lock, flags);
Best regards,
baolu
> From: Baolu Lu <baolu.lu@linux.intel.com> > Sent: Monday, April 22, 2024 1:30 PM > > On 4/16/24 4:06 PM, Lu Baolu wrote: > > + case CACHE_TAG_NESTING_DEVTLB: > > + /* > > + * Address translation cache in device side caches the > > + * result of nested translation. There is no easy way > > + * to identify the exact set of nested translations > > + * affected by a change in S2. So just flush the entire > > + * device cache. > > + */ > > + addr = 0; > > + mask = MAX_AGAW_PFN_WIDTH; > > + fallthrough; > > I realized that the logic above is not right. Setting both @addr and > @mask to 0 doesn't means flush all caches on the device. I will change > it like below: I didn't get. Above code doesn't set @mask to 0. > > diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c > index e8418cdd8331..18debb82272a 100644 > --- a/drivers/iommu/intel/cache.c > +++ b/drivers/iommu/intel/cache.c > @@ -302,9 +302,14 @@ void cache_tag_flush_range(struct dmar_domain > *domain, unsigned long start, > * affected by a change in S2. So just flush > the entire > * device cache. > */ > - addr = 0; > - mask = MAX_AGAW_PFN_WIDTH; > - fallthrough; > + info = dev_iommu_priv_get(tag->dev); > + sid = PCI_DEVID(info->bus, info->devfn); > + > + qi_flush_dev_iotlb(iommu, sid, info->pfsid, > info->ats_qdep, > + 0, MAX_AGAW_PFN_WIDTH); > + quirk_extra_dev_tlb_flush(info, 0, > MAX_AGAW_PFN_WIDTH, > + IOMMU_NO_PASID, > info->ats_qdep); > + break; and I didn't get this change. It goes backward by ignoring tag->pasid. what's the exact problem of the fallthrough logic in original code? 
> case CACHE_TAG_DEVTLB: > info = dev_iommu_priv_get(tag->dev); > sid = PCI_DEVID(info->bus, info->devfn); > > > + case CACHE_TAG_DEVTLB: > > + info = dev_iommu_priv_get(tag->dev); > > + sid = PCI_DEVID(info->bus, info->devfn); > > + > > + if (tag->pasid == IOMMU_NO_PASID) > > + qi_flush_dev_iotlb(iommu, sid, info->pfsid, > > + info->ats_qdep, addr, > mask); > > + else > > + qi_flush_dev_iotlb_pasid(iommu, sid, info- > >pfsid, > > + tag->pasid, info- > >ats_qdep, > > + addr, mask); > > + > > + quirk_extra_dev_tlb_flush(info, addr, mask, tag- > >pasid, info->ats_qdep); > > + break; > > + } > > + } > > + spin_unlock_irqrestore(&domain->cache_lock, flags); > > Best regards, > baolu
On 4/23/24 4:42 PM, Tian, Kevin wrote: >> From: Baolu Lu<baolu.lu@linux.intel.com> >> Sent: Monday, April 22, 2024 1:30 PM >> >> On 4/16/24 4:06 PM, Lu Baolu wrote: >>> + case CACHE_TAG_NESTING_DEVTLB: >>> + /* >>> + * Address translation cache in device side caches the >>> + * result of nested translation. There is no easy way >>> + * to identify the exact set of nested translations >>> + * affected by a change in S2. So just flush the entire >>> + * device cache. >>> + */ >>> + addr = 0; >>> + mask = MAX_AGAW_PFN_WIDTH; >>> + fallthrough; >> I realized that the logic above is not right. Setting both @addr and >> @mask to 0 doesn't means flush all caches on the device. I will change >> it like below: > I didn't get. Above code doesn't set @mask to 0. Oh!? I have no idea why I read that as "mask = 0" now. Perhaps my brain was on vacation earlier. :-) > >> diff --git a/drivers/iommu/intel/cache.c b/drivers/iommu/intel/cache.c >> index e8418cdd8331..18debb82272a 100644 >> --- a/drivers/iommu/intel/cache.c >> +++ b/drivers/iommu/intel/cache.c >> @@ -302,9 +302,14 @@ void cache_tag_flush_range(struct dmar_domain >> *domain, unsigned long start, >> * affected by a change in S2. So just flush >> the entire >> * device cache. >> */ >> - addr = 0; >> - mask = MAX_AGAW_PFN_WIDTH; >> - fallthrough; >> + info = dev_iommu_priv_get(tag->dev); >> + sid = PCI_DEVID(info->bus, info->devfn); >> + >> + qi_flush_dev_iotlb(iommu, sid, info->pfsid, >> info->ats_qdep, >> + 0, MAX_AGAW_PFN_WIDTH); >> + quirk_extra_dev_tlb_flush(info, 0, >> MAX_AGAW_PFN_WIDTH, >> + IOMMU_NO_PASID, >> info->ats_qdep); >> + break; > and I didn't get this change. It goes backward by ignoring tag->pasid. > > what's the exact problem of the fallthrough logic in original code? Sorry! Please ignore this. Best regards, baolu
© 2016 - 2026 Red Hat, Inc.