From: Leon Romanovsky <leonro@nvidia.com>
Export the pci_p2pdma_map_type() function to allow external modules
and subsystems to determine the appropriate mapping type for P2PDMA
transfers between a provider and target device.
The function determines whether peer-to-peer DMA transfers can be
done directly through PCI switches (PCI_P2PDMA_MAP_BUS_ADDR) or
must go through the host bridge (PCI_P2PDMA_MAP_THRU_HOST_BRIDGE),
or if the transfer is not supported at all.
This export enables subsystems like VFIO to properly handle P2PDMA
operations by querying the mapping type before attempting transfers,
ensuring correct DMA address programming and error handling.
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/pci/p2pdma.c | 15 ++++++-
include/linux/pci-p2pdma.h | 85 +++++++++++++++++++++-----------------
2 files changed, 59 insertions(+), 41 deletions(-)
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 8e2525618d922..326c7d88a1690 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -1014,8 +1014,18 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
}
EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
-static enum pci_p2pdma_map_type
-pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
+/**
+ * pci_p2pdma_map_type - Determine the mapping type for P2PDMA transfers
+ * @provider: P2PDMA provider structure
+ * @dev: Target device for the transfer
+ *
+ * Determines how peer-to-peer DMA transfers should be mapped between
+ * the provider and the target device. The mapping type indicates whether
+ * the transfer can be done directly through PCI switches or must go
+ * through the host bridge.
+ */
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+ struct device *dev)
{
enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
struct pci_dev *pdev = to_pci_dev(provider->owner);
@@ -1044,6 +1054,7 @@ pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
return type;
}
+EXPORT_SYMBOL_GPL(pci_p2pdma_map_type);
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page)
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 83f11dc8659a7..dea98baee5ce2 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -26,6 +26,45 @@ struct p2pdma_provider {
u64 bus_offset;
};
+enum pci_p2pdma_map_type {
+ /*
+ * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
+ * the mapping type has been calculated. Exported routines for the API
+ * will never return this value.
+ */
+ PCI_P2PDMA_MAP_UNKNOWN = 0,
+
+ /*
+ * Not a PCI P2PDMA transfer.
+ */
+ PCI_P2PDMA_MAP_NONE,
+
+ /*
+ * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
+ * traverse the host bridge and the host bridge is not in the
+ * allowlist. DMA Mapping routines should return an error when
+ * this is returned.
+ */
+ PCI_P2PDMA_MAP_NOT_SUPPORTED,
+
+ /*
+ * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
+ * each other directly through a PCI switch and the transaction will
+ * not traverse the host bridge. Such a mapping should program
+ * the DMA engine with PCI bus addresses.
+ */
+ PCI_P2PDMA_MAP_BUS_ADDR,
+
+ /*
+ * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
+ * to each other, but the transaction traverses a host bridge on the
+ * allowlist. In this case, a normal mapping either with CPU physical
+ * addresses (in the case of dma-direct) or IOVA addresses (in the
+ * case of IOMMUs) should be used to program the DMA engine.
+ */
+ PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
#ifdef CONFIG_PCI_P2PDMA
struct p2pdma_provider *pci_p2pdma_enable(struct pci_dev *pdev);
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
@@ -45,6 +84,8 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
bool use_p2pdma);
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct p2pdma_provider *provider,
+ struct device *dev);
#else /* CONFIG_PCI_P2PDMA */
static inline struct p2pdma_provider *pci_p2pdma_enable(struct pci_dev *pdev)
{
@@ -105,6 +146,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
{
return sprintf(page, "none\n");
}
+static inline enum pci_p2pdma_map_type
+pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
+{
+ return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
#endif /* CONFIG_PCI_P2PDMA */
@@ -119,45 +165,6 @@ static inline struct pci_dev *pci_p2pmem_find(struct device *client)
return pci_p2pmem_find_many(&client, 1);
}
-enum pci_p2pdma_map_type {
- /*
- * PCI_P2PDMA_MAP_UNKNOWN: Used internally as an initial state before
- * the mapping type has been calculated. Exported routines for the API
- * will never return this value.
- */
- PCI_P2PDMA_MAP_UNKNOWN = 0,
-
- /*
- * Not a PCI P2PDMA transfer.
- */
- PCI_P2PDMA_MAP_NONE,
-
- /*
- * PCI_P2PDMA_MAP_NOT_SUPPORTED: Indicates the transaction will
- * traverse the host bridge and the host bridge is not in the
- * allowlist. DMA Mapping routines should return an error when
- * this is returned.
- */
- PCI_P2PDMA_MAP_NOT_SUPPORTED,
-
- /*
- * PCI_P2PDMA_MAP_BUS_ADDR: Indicates that two devices can talk to
- * each other directly through a PCI switch and the transaction will
- * not traverse the host bridge. Such a mapping should program
- * the DMA engine with PCI bus addresses.
- */
- PCI_P2PDMA_MAP_BUS_ADDR,
-
- /*
- * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: Indicates two devices can talk
- * to each other, but the transaction traverses a host bridge on the
- * allowlist. In this case, a normal mapping either with CPU physical
- * addresses (in the case of dma-direct) or IOVA addresses (in the
- * case of IOMMUs) should be used to program the DMA engine.
- */
- PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
struct pci_p2pdma_map_state {
struct p2pdma_provider *mem;
enum pci_p2pdma_map_type map;
--
2.50.1
On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > From: Leon Romanovsky <leonro@nvidia.com> > > Export the pci_p2pdma_map_type() function to allow external modules > and subsystems to determine the appropriate mapping type for P2PDMA > transfers between a provider and target device. External modules have no business doing this.
On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > > > Export the pci_p2pdma_map_type() function to allow external modules > > and subsystems to determine the appropriate mapping type for P2PDMA > > transfers between a provider and target device. > > External modules have no business doing this. So what's the plan? Today the new DMA API broadly has the pattern: switch (pci_p2pdma_state(p2pdma_state, dev, page)) { [..] if (dma_use_iova(state)) { ret = dma_iova_link(dev, state, paddr, offset, [..] } else { dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size, [..] You can't fully use the new API flow without calling pci_p2pdma_state(), which is also not exported today. Is the idea the full new DMA API flow should not be available to modules? We did export dma_iova_link(). Otherwise, the p2p step needs two functions - a struct page-full and a struct page-less version, and they need to be exported. The names here are not so good, it would be nicer to have them be a dma_* prefixed function since they are used with the other dma_ functions. Jason
On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > > From: Leon Romanovsky <leonro@nvidia.com> > > > > Export the pci_p2pdma_map_type() function to allow external modules > > and subsystems to determine the appropriate mapping type for P2PDMA > > transfers between a provider and target device. > > External modules have no business doing this. VFIO PCI code is built as module. There is no way to access PCI p2p code without exporting functions in it. Thanks > >
On Thu, Jul 24, 2025 at 11:13:21AM +0300, Leon Romanovsky wrote: > On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > > On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > > > From: Leon Romanovsky <leonro@nvidia.com> > > > > > > Export the pci_p2pdma_map_type() function to allow external modules > > > and subsystems to determine the appropriate mapping type for P2PDMA > > > transfers between a provider and target device. > > > > External modules have no business doing this. > > VFIO PCI code is built as module. There is no way to access PCI p2p code > without exporting functions in it. We never ever export anything for "external" modules, and you really should know that.
On Tue, Jul 29, 2025 at 09:52:30AM +0200, Christoph Hellwig wrote: > On Thu, Jul 24, 2025 at 11:13:21AM +0300, Leon Romanovsky wrote: > > On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > > > On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > > > > From: Leon Romanovsky <leonro@nvidia.com> > > > > > > > > Export the pci_p2pdma_map_type() function to allow external modules > > > > and subsystems to determine the appropriate mapping type for P2PDMA > > > > transfers between a provider and target device. > > > > > > External modules have no business doing this. > > > > VFIO PCI code is built as module. There is no way to access PCI p2p code > > without exporting functions in it. > > We never ever export anything for "external" modules, and you really > should know that. It is just a wrong word in commit message. I clearly need it for vfio-pci module and nothing more. "Never attribute to malice that which is adequately explained by stupidity." - Hanlon's razor. Thanks > >
On 2025-07-24 02:13, Leon Romanovsky wrote: > On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: >> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: >>> From: Leon Romanovsky <leonro@nvidia.com> >>> >>> Export the pci_p2pdma_map_type() function to allow external modules >>> and subsystems to determine the appropriate mapping type for P2PDMA >>> transfers between a provider and target device. >> >> External modules have no business doing this. > > VFIO PCI code is built as module. There is no way to access PCI p2p code > without exporting functions in it. The solution that would make more sense to me would be for either dma_iova_try_alloc() or another helper in dma-iommu.c to handle the P2PDMA case. dma-iommu.c already uses those same interfaces and thus there would be no need to export the low level helpers from the p2pdma code. Logan
On Fri, Jul 25, 2025 at 10:30:46AM -0600, Logan Gunthorpe wrote: > > > On 2025-07-24 02:13, Leon Romanovsky wrote: > > On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > >> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > >>> From: Leon Romanovsky <leonro@nvidia.com> > >>> > >>> Export the pci_p2pdma_map_type() function to allow external modules > >>> and subsystems to determine the appropriate mapping type for P2PDMA > >>> transfers between a provider and target device. > >> > >> External modules have no business doing this. > > > > VFIO PCI code is built as module. There is no way to access PCI p2p code > > without exporting functions in it. > > The solution that would make more sense to me would be for either > dma_iova_try_alloc() or another helper in dma-iommu.c to handle the > P2PDMA case. This has nothing to do with dma-iommu.c, the decisions here still need to be made even if dma-iommu.c is not compiled in. It could be exported from the main dma code, but I think it would just be a 1 line wrapper around the existing function? I'd rather rename the functions and leave them in the p2pdma.c files... Jason
On 2025-07-27 13:05, Jason Gunthorpe wrote: > On Fri, Jul 25, 2025 at 10:30:46AM -0600, Logan Gunthorpe wrote: >> >> >> On 2025-07-24 02:13, Leon Romanovsky wrote: >>> On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: >>>> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: >>>>> From: Leon Romanovsky <leonro@nvidia.com> >>>>> >>>>> Export the pci_p2pdma_map_type() function to allow external modules >>>>> and subsystems to determine the appropriate mapping type for P2PDMA >>>>> transfers between a provider and target device. >>>> >>>> External modules have no business doing this. >>> >>> VFIO PCI code is built as module. There is no way to access PCI p2p code >>> without exporting functions in it. >> >> The solution that would make more sense to me would be for either >> dma_iova_try_alloc() or another helper in dma-iommu.c to handle the >> P2PDMA case. > > This has nothing to do with dma-iommu.c, the decisions here still need > to be made even if dma-iommu.c is not compiled in. Doesn't it though? Every single call in patch 10 to the newly exported PCI functions calls into the the dma-iommu functions. If there were non-iommu paths then I would expect the code would use the regular DMA api directly which would then call in to dma-iommu. I can't imagine a use case where someone would want to call the p2pdma functions to map p2p memory and not have a similar path to do the exact same mapping with vanilla memory and thus call the DMA API. And it seems much better to me to export higher level functions to drivers that take care of the details correctly than to expose the nuts and bolts to every driver. The thing that seems special to me about VFIO is that it is calling directly into dma-iommu code to setup unique mappings as opposed to using the higher level DMA API. I don't see in what way it is special that it needs to know intimate details of the memory it's mapping and have different paths to map different types of memory. 
That's what the dma layer is for. Logan
On Mon, Jul 28, 2025 at 10:12:31AM -0600, Logan Gunthorpe wrote: > > > On 2025-07-27 13:05, Jason Gunthorpe wrote: > > On Fri, Jul 25, 2025 at 10:30:46AM -0600, Logan Gunthorpe wrote: > >> > >> > >> On 2025-07-24 02:13, Leon Romanovsky wrote: > >>> On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > >>>> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > >>>>> From: Leon Romanovsky <leonro@nvidia.com> > >>>>> > >>>>> Export the pci_p2pdma_map_type() function to allow external modules > >>>>> and subsystems to determine the appropriate mapping type for P2PDMA > >>>>> transfers between a provider and target device. > >>>> > >>>> External modules have no business doing this. > >>> > >>> VFIO PCI code is built as module. There is no way to access PCI p2p code > >>> without exporting functions in it. > >> > >> The solution that would make more sense to me would be for either > >> dma_iova_try_alloc() or another helper in dma-iommu.c to handle the > >> P2PDMA case. > > > > This has nothing to do with dma-iommu.c, the decisions here still need > > to be made even if dma-iommu.c is not compiled in. > > Doesn't it though? Every single call in patch 10 to the newly exported > PCI functions calls into the the dma-iommu functions. If there were > non-iommu paths then I would expect the code would use the regular DMA > api directly which would then call in to dma-iommu. If p2p type is PCI_P2PDMA_MAP_BUS_ADDR, there will no dma-iommu and DMA at all. +static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, + struct dma_buf_attachment *attachment) +{ + struct vfio_pci_dma_buf *priv = dmabuf->priv; + + if (!attachment->peer2peer) + return -EOPNOTSUPP; + + if (priv->revoked) + return -ENODEV; + + switch (pci_p2pdma_map_type(priv->vdev->provider, attachment->dev)) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + break; + case PCI_P2PDMA_MAP_BUS_ADDR: + /* + * There is no need in IOVA at all for this flow. 
+ * We rely on attachment->priv == NULL as a marker + * for this mode. + */ + return 0; + default: + return -EINVAL; + } + + attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); + if (!attachment->priv) + return -ENOMEM; + + dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->phys_vec.len); + return 0; +}
On 2025-07-28 10:41, Leon Romanovsky wrote: > On Mon, Jul 28, 2025 at 10:12:31AM -0600, Logan Gunthorpe wrote: >> >> >> On 2025-07-27 13:05, Jason Gunthorpe wrote: >>> On Fri, Jul 25, 2025 at 10:30:46AM -0600, Logan Gunthorpe wrote: >>>> >>>> >>>> On 2025-07-24 02:13, Leon Romanovsky wrote: >>>>> On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: >>>>>> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: >>>>>>> From: Leon Romanovsky <leonro@nvidia.com> >>>>>>> >>>>>>> Export the pci_p2pdma_map_type() function to allow external modules >>>>>>> and subsystems to determine the appropriate mapping type for P2PDMA >>>>>>> transfers between a provider and target device. >>>>>> >>>>>> External modules have no business doing this. >>>>> >>>>> VFIO PCI code is built as module. There is no way to access PCI p2p code >>>>> without exporting functions in it. >>>> >>>> The solution that would make more sense to me would be for either >>>> dma_iova_try_alloc() or another helper in dma-iommu.c to handle the >>>> P2PDMA case. >>> >>> This has nothing to do with dma-iommu.c, the decisions here still need >>> to be made even if dma-iommu.c is not compiled in. >> >> Doesn't it though? Every single call in patch 10 to the newly exported >> PCI functions calls into the the dma-iommu functions. If there were >> non-iommu paths then I would expect the code would use the regular DMA >> api directly which would then call in to dma-iommu. > > If p2p type is PCI_P2PDMA_MAP_BUS_ADDR, there will no dma-iommu and DMA > at all. I understand that and it is completely beside my point. If the dma mapping for P2P memory doesn't need to create an iommu mapping then that's fine. But it should be the dma-iommu layer to decide that. It's not a decision that should be made by every driver doing this kind of thing. With P2PDMA memory we are still creating a DMA mapping. It's just the dma address will be a PCI bus address instead of an IOVA. 
My opinion remains: none of these details should be exposed to the drivers. Logan
On Mon, Jul 28, 2025 at 11:07:34AM -0600, Logan Gunthorpe wrote: > > > On 2025-07-28 10:41, Leon Romanovsky wrote: > > On Mon, Jul 28, 2025 at 10:12:31AM -0600, Logan Gunthorpe wrote: > >> > >> > >> On 2025-07-27 13:05, Jason Gunthorpe wrote: > >>> On Fri, Jul 25, 2025 at 10:30:46AM -0600, Logan Gunthorpe wrote: > >>>> > >>>> > >>>> On 2025-07-24 02:13, Leon Romanovsky wrote: > >>>>> On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > >>>>>> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > >>>>>>> From: Leon Romanovsky <leonro@nvidia.com> > >>>>>>> > >>>>>>> Export the pci_p2pdma_map_type() function to allow external modules > >>>>>>> and subsystems to determine the appropriate mapping type for P2PDMA > >>>>>>> transfers between a provider and target device. > >>>>>> > >>>>>> External modules have no business doing this. > >>>>> > >>>>> VFIO PCI code is built as module. There is no way to access PCI p2p code > >>>>> without exporting functions in it. > >>>> > >>>> The solution that would make more sense to me would be for either > >>>> dma_iova_try_alloc() or another helper in dma-iommu.c to handle the > >>>> P2PDMA case. > >>> > >>> This has nothing to do with dma-iommu.c, the decisions here still need > >>> to be made even if dma-iommu.c is not compiled in. > >> > >> Doesn't it though? Every single call in patch 10 to the newly exported > >> PCI functions calls into the the dma-iommu functions. Patch 10 has lots of flows, only one will end up in dma-iommu.c vfio_pci_dma_buf_map() calls pci_p2pdma_bus_addr_map(), dma_iova_link(), dma_map_phys(). Only iova_link would call to dma-iommu.c - if dma_map_phys() is called we know that dma-iommu.c won't be called by it. > >> If there were non-iommu paths then I would expect the code would > >> use the regular DMA api directly which would then call in to > >> dma-iommu. > > > > If p2p type is PCI_P2PDMA_MAP_BUS_ADDR, there will no dma-iommu and DMA > > at all. 
> > I understand that and it is completely beside my point. > > If the dma mapping for P2P memory doesn't need to create an iommu > mapping then that's fine. But it should be the dma-iommu layer to decide > that. So above, we can't use dma-iommu.c, it might not be compiled into the kernel but the dma_map_phys() path is still valid. > It's not a decision that should be made by every driver doing this > kind of thing. Sort of, I think we are trying to get to some place where there are subsystem, or at least data structure specific helpers that do this (ie nvme has BIO helpers), but the helpers should be running this logic directly for performance. Leon hasn't done it but I think we should see helpers for DMABUF too encapsulating the logic shown in patch 10. I think we need to prove it out these basic points first before trying to go and convert a bunch of GPU drivers. The vfio in patch 10 is not the full example since it only has a single scatter/gather" effectively, but the generalized version loops over pci_p2pdma_bus_addr_map(), dma_iova_link(), dma_map_phys() for each page. Part of the new API design is to only do one kind of mapping operation at once, and part of the design is we know that the P2P type is fixed. It makes no performance sense to check the type inside the pci_p2pdma_bus_addr_map()/ dma_iova_link()/dma_map_phys() within the per-page loop. I do think some level of abstraction has been lost here in pursuit of performance. If someone does have a better way to structure this without a performance hit then fantastic, but thats going back and revising the new DMA API. This just builds on top of that, and yes, it is not so abstract. Jason
On 2025-07-28 17:11, Jason Gunthorpe wrote: >> If the dma mapping for P2P memory doesn't need to create an iommu >> mapping then that's fine. But it should be the dma-iommu layer to decide >> that. > > So above, we can't use dma-iommu.c, it might not be compiled into the > kernel but the dma_map_phys() path is still valid. This is an easily solved problem. I did a very rough sketch below to say it's really not that hard. (Note it has some rough edges that could be cleaned up and I based it off Leon's git repo which appears to not be the same as what was posted, but the core concept is sound). Logan diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 1853a969e197..da1a6003620a 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1806,6 +1806,22 @@ bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, } EXPORT_SYMBOL_GPL(dma_iova_try_alloc); +void dma_iova_try_alloc_p2p(struct p2pdma_provider *provider, struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t size) +{ + switch (pci_p2pdma_map_type(provider, dev)) { + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: + dma_iova_try_alloc(dev, state, phys, size); + return; + case PCI_P2PDMA_MAP_BUS_ADDR: + state->bus_addr = true; + return; + default: + return; + } +} +EXPORT_SYMBOL_GPL(dma_iova_try_alloc_p2p); + /** * dma_iova_free - Free an IOVA space * @dev: Device to free the IOVA space for diff --git a/drivers/vfio/pci/vfio_pci_dmabuf.c b/drivers/vfio/pci/vfio_pci_dmabuf.c index 455541d21538..5749be3a9b58 100644 --- a/drivers/vfio/pci/vfio_pci_dmabuf.c +++ b/drivers/vfio/pci/vfio_pci_dmabuf.c @@ -30,25 +30,12 @@ static int vfio_pci_dma_buf_attach(struct dma_buf *dmabuf, if (priv->revoked) return -ENODEV; - switch (pci_p2pdma_map_type(priv->vdev->provider, attachment->dev)) { - case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: - break; - case PCI_P2PDMA_MAP_BUS_ADDR: - /* - * There is no need in IOVA at all for this flow. 
- * We rely on attachment->priv == NULL as a marker - * for this mode. - */ - return 0; - default: - return -EINVAL; - } - attachment->priv = kzalloc(sizeof(struct dma_iova_state), GFP_KERNEL); if (!attachment->priv) return -ENOMEM; - dma_iova_try_alloc(attachment->dev, attachment->priv, 0, priv->size); + dma_iova_try_alloc_p2p(priv->vdev->provider, attachment->dev, + attachment->priv, 0, priv->size); return 0; } @@ -98,26 +85,11 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, sgl = sgt->sgl; for (i = 0; i < priv->nr_ranges; i++) { - if (!state) { - addr = pci_p2pdma_bus_addr_map(provider, - phys_vec[i].paddr); - } else if (dma_use_iova(state)) { - ret = dma_iova_link(attachment->dev, state, - phys_vec[i].paddr, 0, - phys_vec[i].len, dir, attrs); - if (ret) - goto err_unmap_dma; - - mapped_len += phys_vec[i].len; - } else { - addr = dma_map_phys(attachment->dev, phys_vec[i].paddr, - phys_vec[i].len, dir, attrs); - ret = dma_mapping_error(attachment->dev, addr); - if (ret) - goto err_unmap_dma; - } + addr = dma_map_phys_prealloc(attachment->dev, phys_vec[i].paddr, + phys_vec[i].len, dir, attrs, state, + provider); - if (!state || !dma_use_iova(state)) { + if (addr != DMA_MAPPING_USE_IOVA) { /* * In IOVA case, there is only one SG entry which spans * for whole IOVA address space. 
So there is no need @@ -128,7 +100,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, } } - if (state && dma_use_iova(state)) { + if (addr == DMA_MAPPING_USE_IOVA) { WARN_ON_ONCE(mapped_len != priv->size); ret = dma_iova_sync(attachment->dev, state, 0, mapped_len); if (ret) @@ -139,7 +111,7 @@ vfio_pci_dma_buf_map(struct dma_buf_attachment *attachment, return sgt; err_unmap_dma: - if (!i || !state) + if (!i || state->bus_addr) ; /* Do nothing */ else if (dma_use_iova(state)) dma_iova_destroy(attachment->dev, state, mapped_len, dir, @@ -164,7 +136,7 @@ static void vfio_pci_dma_buf_unmap(struct dma_buf_attachment *attachment, struct scatterlist *sgl; int i; - if (!state) + if (state->bus_addr) ; /* Do nothing */ else if (dma_use_iova(state)) dma_iova_destroy(attachment->dev, state, priv->size, dir, diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index ba54bbeca861..675e5ac13265 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -70,11 +70,14 @@ */ #define DMA_MAPPING_ERROR (~(dma_addr_t)0) +#define DMA_MAPPING_USE_IOVA ((dma_addr_t)-2) + #define DMA_BIT_MASK(n) (((n) == 64) ? 
~0ULL : ((1ULL<<(n))-1)) struct dma_iova_state { dma_addr_t addr; u64 __size; + bool bus_addr; }; /* @@ -120,6 +123,12 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, enum dma_data_direction dir, unsigned long attrs); + +struct p2pdma_provider; +dma_addr_t dma_map_phys_prealloc(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs, + struct dma_iova_state *state, struct p2pdma_provider *provider); + void dma_unmap_phys(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs); unsigned int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, @@ -321,6 +330,8 @@ static inline bool dma_use_iova(struct dma_iova_state *state) bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state, phys_addr_t phys, size_t size); +void dma_iova_try_alloc_p2p(struct p2pdma_provider *provider, struct device *dev, + struct dma_iova_state *state, phys_addr_t phys, size_t size); void dma_iova_free(struct device *dev, struct dma_iova_state *state); void dma_iova_destroy(struct device *dev, struct dma_iova_state *state, size_t mapped_len, enum dma_data_direction dir, @@ -343,6 +354,11 @@ static inline bool dma_iova_try_alloc(struct device *dev, { return false; } +static inline void dma_iova_try_alloc_p2p(struct p2pdma_provider *provider, + struct device *dev, struct dma_iova_state *state, phys_addr_t phys, + size_t size) +{ +} static inline void dma_iova_free(struct device *dev, struct dma_iova_state *state) { diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index e1586eb52ab3..b2110098a29b 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -13,6 +13,7 @@ #include <linux/iommu-dma.h> #include <linux/kmsan.h> #include <linux/of_device.h> +#include <linux/pci-p2pdma.h> #include <linux/slab.h> #include 
<linux/vmalloc.h> #include "debug.h" @@ -202,6 +203,27 @@ dma_addr_t dma_map_phys(struct device *dev, phys_addr_t phys, size_t size, } EXPORT_SYMBOL_GPL(dma_map_phys); +dma_addr_t dma_map_phys_prealloc(struct device *dev, phys_addr_t phys, size_t size, + enum dma_data_direction dir, unsigned long attrs, + struct dma_iova_state *state, struct p2pdma_provider *provider) +{ + int ret; + + if (state->bus_addr) + return pci_p2pdma_bus_addr_map(provider, phys); + + if (dma_use_iova(state)) { + ret = dma_iova_link(dev, state, phys, 0, size, dir, attrs); + if (ret) + return DMA_MAPPING_ERROR; + + return DMA_MAPPING_USE_IOVA; + } + + return dma_map_phys(dev, phys, size, dir, attrs); +} +EXPORT_SYMBOL_GPL(dma_map_phys_prealloc); + dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs)
On Tue, Jul 29, 2025 at 02:54:13PM -0600, Logan Gunthorpe wrote: > > > On 2025-07-28 17:11, Jason Gunthorpe wrote: > >> If the dma mapping for P2P memory doesn't need to create an iommu > >> mapping then that's fine. But it should be the dma-iommu layer to decide > >> that. > > > > So above, we can't use dma-iommu.c, it might not be compiled into the > > kernel but the dma_map_phys() path is still valid. > > This is an easily solved problem. I did a very rough sketch below to say > it's really not that hard. (Note it has some rough edges that could be > cleaned up and I based it off Leon's git repo which appears to not be > the same as what was posted, but the core concept is sound). I started to prepare v2, which is why the posted version is slightly different from the dmabuf-vfio branch. In addition to what Jason wrote, there is an extra complexity with using state. The wrappers which operate on dma_iova_state assume that all memory, which is going to be mapped, is the same type: either p2p or not. This is not the case for HMM/RDMA users, where you create the state in advance and get mixed types of pages. Thanks
On Tue, Jul 29, 2025 at 02:54:13PM -0600, Logan Gunthorpe wrote: > > > On 2025-07-28 17:11, Jason Gunthorpe wrote: > >> If the dma mapping for P2P memory doesn't need to create an iommu > >> mapping then that's fine. But it should be the dma-iommu layer to decide > >> that. > > > > So above, we can't use dma-iommu.c, it might not be compiled into the > > kernel but the dma_map_phys() path is still valid. > > This is an easily solved problem. I did a very rough sketch below to say > it's really not that hard. (Note it has some rough edges that could be > cleaned up and I based it off Leon's git repo which appears to not be > the same as what was posted, but the core concept is sound). I did hope for something like this in the early days, but it proved not so easy to get agreements on details :( My feeling was we should get some actual examples of using this thing and then it is far easier to discuss ideas, like yours here, to improve it. Many of the discussions kind of got confused without enough actual usering code for everyone to refer to. For instance the nvme use case is a big driver for the API design, and it is quite different from these simpler flows, this idea needs to see how it would work there. Maybe this idea could also have provider = NULL meaning it is CPU cachable memory? > +static inline void dma_iova_try_alloc_p2p(struct p2pdma_provider *provider, > + struct device *dev, struct dma_iova_state *state, phys_addr_t phys, > + size_t size) > +{ > +} Can't be empty - PCI_P2PDMA_MAP_THRU_HOST_BRIDGE vs PCI_P2PDMA_MAP_BUS_ADDR still matters so it still must set dma_iova_state::bus_addr to get dma_map_phys_prealloc() to do the right thing. Still, it would make sense to put something like that in dma/mapping.c and rely on the static inline stub for dma_iova_try_alloc().. 
> for (i = 0; i < priv->nr_ranges; i++) { > - if (!state) { > - addr = pci_p2pdma_bus_addr_map(provider, > - phys_vec[i].paddr); > - } else if (dma_use_iova(state)) { > - ret = dma_iova_link(attachment->dev, state, > - phys_vec[i].paddr, 0, > - phys_vec[i].len, dir, attrs); > - if (ret) > - goto err_unmap_dma; > - > - mapped_len += phys_vec[i].len; > - } else { > - addr = dma_map_phys(attachment->dev, phys_vec[i].paddr, > - phys_vec[i].len, dir, attrs); > - ret = dma_mapping_error(attachment->dev, addr); > - if (ret) > - goto err_unmap_dma; > - } > + addr = dma_map_phys_prealloc(attachment->dev, phys_vec[i].paddr, > + phys_vec[i].len, dir, attrs, state, > + provider); There was a draft of something like this at some point. The DMA_MAPPING_USE_IOVA is a new twist though > #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) > struct dma_iova_state { > dma_addr_t addr; > u64 __size; > + bool bus_addr; > }; Gowing this structure has been strongly pushed back on. This probably can be solved in some other way, a bitfield on size perhaps.. > +dma_addr_t dma_map_phys_prealloc(struct device *dev, phys_addr_t phys, > size_t size, > + enum dma_data_direction dir, unsigned long attrs, > + struct dma_iova_state *state, struct p2pdma_provider *provider) > +{ > + int ret; > + > + if (state->bus_addr) > + return pci_p2pdma_bus_addr_map(provider, phys); > + > + if (dma_use_iova(state)) { > + ret = dma_iova_link(dev, state, phys, 0, size, dir, attrs); > + if (ret) > + return DMA_MAPPING_ERROR; > + > + return DMA_MAPPING_USE_IOVA; > + } > + > + return dma_map_phys(dev, phys, size, dir, attrs); > +} > +EXPORT_SYMBOL_GPL(dma_map_phys_prealloc); I would be tempted to inline this Overall, yeah I would certainly welcome improvements like this if everyone can agree, but I'd really like to see nvme merged before we start working on ideas. That way the proposal can be properly evaluated by all the stake holders. Jason
On Fri, Jul 25, 2025 at 10:30:46AM -0600, Logan Gunthorpe wrote: > > > On 2025-07-24 02:13, Leon Romanovsky wrote: > > On Thu, Jul 24, 2025 at 10:03:13AM +0200, Christoph Hellwig wrote: > >> On Wed, Jul 23, 2025 at 04:00:06PM +0300, Leon Romanovsky wrote: > >>> From: Leon Romanovsky <leonro@nvidia.com> > >>> > >>> Export the pci_p2pdma_map_type() function to allow external modules > >>> and subsystems to determine the appropriate mapping type for P2PDMA > >>> transfers between a provider and target device. > >> > >> External modules have no business doing this. > > > > VFIO PCI code is built as module. There is no way to access PCI p2p code > > without exporting functions in it. > > The solution that would make more sense to me would be for either > dma_iova_try_alloc() or another helper in dma-iommu.c to handle the > P2PDMA case. dma-iommu.c already uses those same interfaces and thus > there would be no need to export the low level helpers from the p2pdma code. I had the same idea in early versions of the DMA phys API discussion and it was pointed out (absolutely rightly) that this is a layering violation. At that time, that remark wasn't so clear to me because HMM code performs a check for p2p on every page and has a call to dma_iova_try_alloc() before that check. But this VFIO DMABUF code shows it much more clearly. The p2p check is performed before any DMA calls and in case of the PCI_P2PDMA_MAP_BUS_ADDR p2p type between the DMABUF exporter device and DMABUF importer device, we don't call dma_iova_try_alloc() or any DMA API at all. So unfortunately, I think that dma*.c|h is not the right place for the p2p type check. Thanks > > Logan >
On 2025-07-25 12:54, Leon Romanovsky wrote: >> The solution that would make more sense to me would be for either >> dma_iova_try_alloc() or another helper in dma-iommu.c to handle the >> P2PDMA case. dma-iommu.c already uses those same interfaces and thus >> there would be no need to export the low level helpers from the p2pdma code. > > I had same idea in early versions of DMA phys API discussion and it was > pointed (absolutely right) that this is layering violation. Respectfully, I have to disagree with this. Having the layer (ie. dma-iommu) that normally checks how to handle a P2PDMA request now check how to handle these DMA requests is the exact opposite of a layering violation. Expecting every driver that wants to do P2PDMA to have to figure out for themselves how to map the memory before calling into the DMA API doesn't seem like a good design choice to me. > So unfortunately, I think that dma*.c|h is not right place for p2p > type check. dma*.c is already where those checks are done. I'm not sure patches to remove the code from that layer and put it into the NVMe driver would make a lot of sense (and then, of course, we'd have to put it into every other driver that wants to participate in p2p transactions). Logan
On Fri, Jul 25, 2025 at 01:12:35PM -0600, Logan Gunthorpe wrote: > > > On 2025-07-25 12:54, Leon Romanovsky wrote: > >> The solution that would make more sense to me would be for either > >> dma_iova_try_alloc() or another helper in dma-iommu.c to handle the > >> P2PDMA case. dma-iommu.c already uses those same interfaces and thus > >> there would be no need to export the low level helpers from the p2pdma code. > > > > I had same idea in early versions of DMA phys API discussion and it was > > pointed (absolutely right) that this is layering violation. > > Respectfully, I have to disagree with this. Having the layer (ie. > dma-iommu) that normally checks how to handle a P2PDMA request now check > how to handle these DMA requests is the exact opposite of a layering > violation. I'm aware of your implementation and have a feeling that it was heavily influenced by NVMe requirements, so the end result is very tailored for it. Other users have very different paths if p2p is taken. Just see the last VFIO patch in this series; it skips all DMA logic. > Expecting every driver that wants to do P2PDMA to have to > figure out for themselves how to map the memory before calling into the > DMA API doesn't seem like a good design choice to me. We had this discussion earlier too on previous versions. The summary is that p2p capable devices are very special anyway. They need to work with p2p natively. BTW, the implementation is not supposed to be in the drivers, but in their respective subsystems. > > > So unfortunately, I think that dma*.c|h is not right place for p2p > > type check. > > dma*.c is already where those checks are done. I'm not sure patches to > remove the code from that layer and put it into the NVMe driver would > make a lot of sense (and then, of course, we'd have to put it into every > other driver that wants to participate in p2p transactions). I don't have plans to remove existing checks right now, but NVMe was already converted to the new DMA phys API. Thanks > > Logan > >
© 2016 - 2025 Red Hat, Inc.