iommu/vt-d: Refactor PCI PRI enabling/disabling callbacks

[PATCH] iommu/vt-d: Refactor PCI PRI enabling/disabling callbacks

Posted by Lu Baolu 1 year, 8 months ago

Commit 0095bf83554f8 ("iommu: Improve iopf_queue_remove_device()")
specified the flow for disabling the PRI on a device. Refactor the
PRI callbacks in the intel iommu driver to better manage PRI
enabling and disabling and align it with the device queue interfaces
in the iommu core.

Signed-off-by: Lu Baolu <baolu.lu@linux.intel.com>
---
 drivers/iommu/intel/iommu.h |  9 +++++
 drivers/iommu/intel/iommu.c | 77 ++++++++++++++++++++++++++++++++++---
 drivers/iommu/intel/pasid.c |  2 -
 3 files changed, 81 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index eaf015b4353b..3d5d8f786906 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -1047,6 +1047,15 @@ static inline void context_set_sm_pre(struct context_entry *context)
 	context->lo |= BIT_ULL(4);
 }
 
+/*
+ * Clear the PRE(Page Request Enable) field of a scalable mode context
+ * entry.
+ */
+static inline void context_clear_sm_pre(struct context_entry *context)
+{
+	context->lo &= ~BIT_ULL(4);
+}
+
 /* Returns a number of VTD pages, but aligned to MM page size */
 static inline unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
 {
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 2e9811bf2a4e..2d4b122bcc1c 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -4213,6 +4213,57 @@ static int intel_iommu_enable_sva(struct device *dev)
 	return 0;
 }
 
+/*
+ * Invalidate the caches for a present-to-present change in a context
+ * table entry according to the Spec 6.5.3.3 (Guidance to Software for
+ * Invalidations).
+ *
+ * Since context entry is not encoded by domain-id when operating in
+ * scalable-mode (refer Section 6.2.1), this performs coarser
+ * invalidation than the domain-selective granularity requested.
+ */
+static void invalidate_present_context_change(struct device_domain_info *info)
+{
+	struct intel_iommu *iommu = info->iommu;
+
+	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
+	if (sm_supported(iommu))
+		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
+	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
+	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
+}
+
+static int context_flip_pri(struct device_domain_info *info, bool enable)
+{
+	struct intel_iommu *iommu = info->iommu;
+	u8 bus = info->bus, devfn = info->devfn;
+	struct context_entry *context;
+
+	spin_lock(&iommu->lock);
+	if (context_copied(iommu, bus, devfn)) {
+		spin_unlock(&iommu->lock);
+		return -EINVAL;
+	}
+
+	context = iommu_context_addr(iommu, bus, devfn, false);
+	if (!context || !context_present(context)) {
+		spin_unlock(&iommu->lock);
+		return -ENODEV;
+	}
+
+	if (enable)
+		context_set_sm_pre(context);
+	else
+		context_clear_sm_pre(context);
+
+	if (!ecap_coherent(iommu->ecap))
+		clflush_cache_range(context, sizeof(*context));
+	invalidate_present_context_change(info);
+	spin_unlock(&iommu->lock);
+
+	return 0;
+}
+
 static int intel_iommu_enable_iopf(struct device *dev)
 {
 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
@@ -4242,15 +4293,23 @@ static int intel_iommu_enable_iopf(struct device *dev)
 	if (ret)
 		return ret;
 
+	ret = context_flip_pri(info, true);
+	if (ret)
+		goto err_remove_device;
+
 	ret = pci_enable_pri(pdev, PRQ_DEPTH);
-	if (ret) {
-		iopf_queue_remove_device(iommu->iopf_queue, dev);
-		return ret;
-	}
+	if (ret)
+		goto err_clear_pri;
 
 	info->pri_enabled = 1;
 
 	return 0;
+err_clear_pri:
+	context_flip_pri(info, false);
+err_remove_device:
+	iopf_queue_remove_device(iommu->iopf_queue, dev);
+
+	return ret;
 }
 
 static int intel_iommu_disable_iopf(struct device *dev)
@@ -4261,6 +4320,15 @@ static int intel_iommu_disable_iopf(struct device *dev)
 	if (!info->pri_enabled)
 		return -EINVAL;
 
+	/* Disable new PRI reception: */
+	context_flip_pri(info, false);
+
+	/*
+	 * Remove device from fault queue and acknowledge all outstanding
+	 * PRQs to the device:
+	 */
+	iopf_queue_remove_device(iommu->iopf_queue, dev);
+
 	/*
 	 * PCIe spec states that by clearing PRI enable bit, the Page
 	 * Request Interface will not issue new page requests, but has
@@ -4271,7 +4339,6 @@ static int intel_iommu_disable_iopf(struct device *dev)
 	 */
 	pci_disable_pri(to_pci_dev(dev));
 	info->pri_enabled = 0;
-	iopf_queue_remove_device(iommu->iopf_queue, dev);
 
 	return 0;
 }
diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
index abce19e2ad6f..286a8a7d66f5 100644
--- a/drivers/iommu/intel/pasid.c
+++ b/drivers/iommu/intel/pasid.c
@@ -768,8 +768,6 @@ static int context_entry_set_pasid_table(struct context_entry *context,
 
 	if (info->ats_supported)
 		context_set_sm_dte(context);
-	if (info->pri_supported)
-		context_set_sm_pre(context);
 	if (info->pasid_supported)
 		context_set_pasid(context);
 
-- 
2.34.1

RE: [PATCH] iommu/vt-d: Refactor PCI PRI enabling/disabling callbacks

Posted by Tian, Kevin 1 year, 7 months ago

> From: Lu Baolu <baolu.lu@linux.intel.com>
> Sent: Thursday, June 6, 2024 11:40 AM
> 
> +/*
> + * Invalidate the caches for a present-to-present change in a context
> + * table entry according to the Spec 6.5.3.3 (Guidance to Software for
> + * Invalidations).
> + *
> + * Since context entry is not encoded by domain-id when operating in
> + * scalable-mode (refer Section 6.2.1), this performs coarser
> + * invalidation than the domain-selective granularity requested.
> + */
> +static void invalidate_present_context_change(struct device_domain_info
> *info)
> +{
> +	struct intel_iommu *iommu = info->iommu;
> +
> +	iommu->flush.flush_context(iommu, 0, 0, 0,
> DMA_CCMD_GLOBAL_INVL);
> +	if (sm_supported(iommu))
> +		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
> +	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
> +	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
> +}
> +

this invalidates the entire cache/iotlb for all devices behind this
iommu just due to a PRI enable/disable operation on a single
device.

No that's way too much. If there is a burden to identify all active
DIDs used by this device then pay it and penalize only that device.

btw in concept PRI will not be enabled/disabled when there are
PASIDs of this device being actively attached. So at this point
there should only be RID with attached domain then we only
need to find that DID out and use it to invalidate related caches.

Re: [PATCH] iommu/vt-d: Refactor PCI PRI enabling/disabling callbacks

Posted by Jason Gunthorpe 1 year, 7 months ago

On Wed, Jun 26, 2024 at 06:53:04AM +0000, Tian, Kevin wrote:

> btw in concept PRI will not be enabled/disabled when there are
> PASIDs of this device being actively attached. 

Please don't bake such an assumption in.

non-PRI paging domains can be attached to a PASID and should not
enable PRI.

PRI is enabled when the first PRI domain is attached to a RID/PASID
and disabled when the last PRI domain is detached.

Jason

Re: [PATCH] iommu/vt-d: Refactor PCI PRI enabling/disabling callbacks

Posted by Baolu Lu 1 year, 7 months ago

On 2024/6/26 14:53, Tian, Kevin wrote:
>> From: Lu Baolu <baolu.lu@linux.intel.com>
>> Sent: Thursday, June 6, 2024 11:40 AM
>>
>> +/*
>> + * Invalidate the caches for a present-to-present change in a context
>> + * table entry according to the Spec 6.5.3.3 (Guidance to Software for
>> + * Invalidations).
>> + *
>> + * Since context entry is not encoded by domain-id when operating in
>> + * scalable-mode (refer Section 6.2.1), this performs coarser
>> + * invalidation than the domain-selective granularity requested.
>> + */
>> +static void invalidate_present_context_change(struct device_domain_info
>> *info)
>> +{
>> +	struct intel_iommu *iommu = info->iommu;
>> +
>> +	iommu->flush.flush_context(iommu, 0, 0, 0,
>> DMA_CCMD_GLOBAL_INVL);
>> +	if (sm_supported(iommu))
>> +		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
>> +	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
>> +	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
>> +}
>> +
> 
> this invalidates the entire cache/iotlb for all devices behind this
> iommu just due to a PRI enable/disable operation on a single
> device.
> 
> No that's way too much. If there is a burden to identify all active
> DIDs used by this device then pay it and penalize only that device.

You are right. We should not simplify the flow like this.

> 
> btw in concept PRI will not be enabled/disabled when there are
> PASIDs of this device being actively attached. So at this point
> there should only be RID with attached domain then we only
> need to find that DID out and use it to invalidate related caches.

The assumption of "PRI will not be enabled/disabled when there are
PASIDs of this device being actively attached" is not always correct.
Both the pasid domain attachment and PRI are controlled by the device
driver and there is no order rules for the drivers.

For example, the idxd driver attaches the default domain to a PASID and
use it for kernel ENQCMD and use other PASIDs for SVA usage.

I am considering working out a generic helper to handle caches after
change to a context entry what was present. How do you like below code
(compiled but not tested)?

/*
  * Cache invalidations after change in a context table entry that was 
present
  * according to the Spec 6.5.3.3 (Guidance to Software for 
Invalidations). If
  * IOMMU is in scalable mode and all PASID table entries of the device were
  * non-present, set affect_domains to true. Otherwise, false.
  */
void intel_context_flush_present(struct device_domain_info *info,
				 struct context_entry *context,
				 bool affect_domains)
{
	struct intel_iommu *iommu = info->iommu;
	u16 did = context_domain_id(context);
	struct pasid_entry *pte;
	int i;

	assert_spin_locked(&iommu->lock);

	/*
	 * Device-selective context-cache invalidation. The Domain-ID field
	 * of the Context-cache Invalidate Descriptor is ignored by hardware
	 * when operating in scalable mode. Therefore the @did value doesn't
	 * matter in scalable mode.
	 */
	iommu->flush.flush_context(iommu, did, PCI_DEVID(info->bus, info->devfn),
				   DMA_CCMD_MASK_NOBIT, DMA_CCMD_DEVICE_INVL);

	/*
	 * For legacy mode:
	 * - Domain-selective IOTLB invalidation
	 * - Global Device-TLB invalidation to all affected functions
	 */
	if (!sm_supported(iommu)) {
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
		__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);

		return;
	}

	/*
	 * For scalable mode:
	 * - Domain-selective PASID-cache invalidation to affected domains
	 * - Domain-selective IOTLB invalidation to affected domains
	 * - Global Device-TLB invalidation to affected functions
	 */
	if (affect_domains) {
		for (i = 0; i < info->pasid_table->max_pasid; i++) {
			pte = intel_pasid_get_entry(info->dev, i);
			if (!pte || !pasid_pte_is_present(pte))
				continue;

			did = pasid_get_domain_id(pte);
			qi_flush_pasid_cache(iommu, did, QI_PC_ALL_PASIDS, 0);
			iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
		}
	}

	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
}

Best regards,
baolu

RE: [PATCH] iommu/vt-d: Refactor PCI PRI enabling/disabling callbacks

Posted by Tian, Kevin 1 year, 7 months ago

> From: Baolu Lu <baolu.lu@linux.intel.com>
> Sent: Wednesday, June 26, 2024 9:05 PM
> 
> On 2024/6/26 14:53, Tian, Kevin wrote:
> >> From: Lu Baolu <baolu.lu@linux.intel.com>
> >> Sent: Thursday, June 6, 2024 11:40 AM
> >>
> >> +/*
> >> + * Invalidate the caches for a present-to-present change in a context
> >> + * table entry according to the Spec 6.5.3.3 (Guidance to Software for
> >> + * Invalidations).
> >> + *
> >> + * Since context entry is not encoded by domain-id when operating in
> >> + * scalable-mode (refer Section 6.2.1), this performs coarser
> >> + * invalidation than the domain-selective granularity requested.
> >> + */
> >> +static void invalidate_present_context_change(struct
> device_domain_info
> >> *info)
> >> +{
> >> +	struct intel_iommu *iommu = info->iommu;
> >> +
> >> +	iommu->flush.flush_context(iommu, 0, 0, 0,
> >> DMA_CCMD_GLOBAL_INVL);
> >> +	if (sm_supported(iommu))
> >> +		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
> >> +	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
> >> +	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
> >> +}
> >> +
> >
> > this invalidates the entire cache/iotlb for all devices behind this
> > iommu just due to a PRI enable/disable operation on a single
> > device.
> >
> > No that's way too much. If there is a burden to identify all active
> > DIDs used by this device then pay it and penalize only that device.
> 
> You are right. We should not simplify the flow like this.
> 
> >
> > btw in concept PRI will not be enabled/disabled when there are
> > PASIDs of this device being actively attached. So at this point
> > there should only be RID with attached domain then we only
> > need to find that DID out and use it to invalidate related caches.
> 
> The assumption of "PRI will not be enabled/disabled when there are
> PASIDs of this device being actively attached" is not always correct.
> Both the pasid domain attachment and PRI are controlled by the device
> driver and there is no order rules for the drivers.

Yeah. Not sure how I got it wrong in previous reply. 😊

> 
> For example, the idxd driver attaches the default domain to a PASID and
> use it for kernel ENQCMD and use other PASIDs for SVA usage.
> 
> I am considering working out a generic helper to handle caches after
> change to a context entry what was present. How do you like below code
> (compiled but not tested)?
> 
> /*
>   * Cache invalidations after change in a context table entry that was
> present
>   * according to the Spec 6.5.3.3 (Guidance to Software for
> Invalidations). If
>   * IOMMU is in scalable mode and all PASID table entries of the device were
>   * non-present, set affect_domains to true. Otherwise, false.
>   */
> void intel_context_flush_present(struct device_domain_info *info,
> 				 struct context_entry *context,
> 				 bool affect_domains)
> {
> 	struct intel_iommu *iommu = info->iommu;
> 	u16 did = context_domain_id(context);
> 	struct pasid_entry *pte;
> 	int i;
> 
> 	assert_spin_locked(&iommu->lock);
> 
> 	/*
> 	 * Device-selective context-cache invalidation. The Domain-ID field
> 	 * of the Context-cache Invalidate Descriptor is ignored by hardware
> 	 * when operating in scalable mode. Therefore the @did value
> doesn't
> 	 * matter in scalable mode.
> 	 */
> 	iommu->flush.flush_context(iommu, did, PCI_DEVID(info->bus, info-
> >devfn),
> 				   DMA_CCMD_MASK_NOBIT,
> DMA_CCMD_DEVICE_INVL);
> 
> 	/*
> 	 * For legacy mode:
> 	 * - Domain-selective IOTLB invalidation
> 	 * - Global Device-TLB invalidation to all affected functions
> 	 */
> 	if (!sm_supported(iommu)) {
> 		iommu->flush.flush_iotlb(iommu, did, 0, 0,
> DMA_TLB_DSI_FLUSH);
> 		__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
> 
> 		return;
> 	}
> 
> 	/*
> 	 * For scalable mode:
> 	 * - Domain-selective PASID-cache invalidation to affected domains
> 	 * - Domain-selective IOTLB invalidation to affected domains
> 	 * - Global Device-TLB invalidation to affected functions
> 	 */
> 	if (affect_domains) {
> 		for (i = 0; i < info->pasid_table->max_pasid; i++) {
> 			pte = intel_pasid_get_entry(info->dev, i);
> 			if (!pte || !pasid_pte_is_present(pte))
> 				continue;
> 
> 			did = pasid_get_domain_id(pte);
> 			qi_flush_pasid_cache(iommu, did,
> QI_PC_ALL_PASIDS, 0);
> 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
> DMA_TLB_DSI_FLUSH);
> 		}
> 	}
> 
> 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
> }
> 

looks good to me.