From: Leon Romanovsky <leonro@nvidia.com>

Extract the core P2PDMA provider information (device owner and bus
offset) from the dev_pagemap into a dedicated p2pdma_provider structure.
This creates a cleaner separation between the memory management layer and
the P2PDMA functionality.

The new p2pdma_provider structure contains:
- owner: pointer to the providing device
- bus_offset: computed offset for non-host transactions

This refactoring simplifies the P2PDMA state management by removing
the need to access pgmap internals directly. The pci_p2pdma_map_state
now stores a pointer to the provider instead of the pgmap, making
the API more explicit and easier to understand.

Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/pci/p2pdma.c | 42 +++++++++++++++++++++-----------------
include/linux/pci-p2pdma.h | 18 ++++++++++++----
2 files changed, 37 insertions(+), 23 deletions(-)
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index fe347ed7fd8f4..5a310026bd24f 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -28,9 +28,8 @@ struct pci_p2pdma {
};
struct pci_p2pdma_pagemap {
- struct pci_dev *provider;
- u64 bus_offset;
struct dev_pagemap pgmap;
+ struct p2pdma_provider mem;
};
static struct pci_p2pdma_pagemap *to_p2p_pgmap(struct dev_pagemap *pgmap)
@@ -204,8 +203,8 @@ static void p2pdma_page_free(struct page *page)
{
struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page_pgmap(page));
/* safe to dereference while a reference is held to the percpu ref */
- struct pci_p2pdma *p2pdma =
- rcu_dereference_protected(pgmap->provider->p2pdma, 1);
+ struct pci_p2pdma *p2pdma = rcu_dereference_protected(
+ to_pci_dev(pgmap->mem.owner)->p2pdma, 1);
struct percpu_ref *ref;
gen_pool_free_owner(p2pdma->pool, (uintptr_t)page_to_virt(page),
@@ -270,14 +269,15 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
static void pci_p2pdma_unmap_mappings(void *data)
{
- struct pci_dev *pdev = data;
+ struct pci_p2pdma_pagemap *p2p_pgmap = data;
/*
* Removing the alloc attribute from sysfs will call
* unmap_mapping_range() on the inode, teardown any existing userspace
* mappings and prevent new ones from being created.
*/
- sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+ sysfs_remove_file_from_group(&p2p_pgmap->mem.owner->kobj,
+ &p2pmem_alloc_attr.attr,
p2pmem_group.name);
}
@@ -328,10 +328,9 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
pgmap->ops = &p2pdma_pgmap_ops;
-
- p2p_pgmap->provider = pdev;
- p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
- pci_resource_start(pdev, bar);
+ p2p_pgmap->mem.owner = &pdev->dev;
+ p2p_pgmap->mem.bus_offset =
+ pci_bus_address(pdev, bar) - pci_resource_start(pdev, bar);
addr = devm_memremap_pages(&pdev->dev, pgmap);
if (IS_ERR(addr)) {
@@ -340,7 +339,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
}
error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
- pdev);
+ p2p_pgmap);
if (error)
goto pages_free;
@@ -973,16 +972,16 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
}
EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
-static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
- struct device *dev)
+static enum pci_p2pdma_map_type
+pci_p2pdma_map_type(struct p2pdma_provider *provider, struct device *dev)
{
enum pci_p2pdma_map_type type = PCI_P2PDMA_MAP_NOT_SUPPORTED;
- struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
+ struct pci_dev *pdev = to_pci_dev(provider->owner);
struct pci_dev *client;
struct pci_p2pdma *p2pdma;
int dist;
- if (!provider->p2pdma)
+ if (!pdev->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
if (!dev_is_pci(dev))
@@ -991,7 +990,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
client = to_pci_dev(dev);
rcu_read_lock();
- p2pdma = rcu_dereference(provider->p2pdma);
+ p2pdma = rcu_dereference(pdev->p2pdma);
if (p2pdma)
type = xa_to_value(xa_load(&p2pdma->map_types,
@@ -999,7 +998,7 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
rcu_read_unlock();
if (type == PCI_P2PDMA_MAP_UNKNOWN)
- return calc_map_type_and_dist(provider, client, &dist, true);
+ return calc_map_type_and_dist(pdev, client, &dist, true);
return type;
}
@@ -1007,8 +1006,13 @@ static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page)
{
- state->pgmap = page_pgmap(page);
- state->map = pci_p2pdma_map_type(state->pgmap, dev);
+ struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page_pgmap(page));
+
+ if (state->mem == &p2p_pgmap->mem)
+ return;
+
+ state->mem = &p2p_pgmap->mem;
+ state->map = pci_p2pdma_map_type(&p2p_pgmap->mem, dev);
}
/**
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index b502fc8b49bf9..27a2c399f47da 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -16,6 +16,16 @@
struct block_device;
struct scatterlist;
+/**
+ * struct p2pdma_provider
+ *
+ * A p2pdma provider is a range of MMIO address space available to the CPU.
+ */
+struct p2pdma_provider {
+ struct device *owner;
+ u64 bus_offset;
+};
+
#ifdef CONFIG_PCI_P2PDMA
int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset);
@@ -144,10 +154,11 @@ enum pci_p2pdma_map_type {
};
struct pci_p2pdma_map_state {
- struct dev_pagemap *pgmap;
+ struct p2pdma_provider *mem;
enum pci_p2pdma_map_type map;
};
+
/* helper for pci_p2pdma_state(), do not use directly */
void __pci_p2pdma_update_state(struct pci_p2pdma_map_state *state,
struct device *dev, struct page *page);
@@ -166,8 +177,7 @@ pci_p2pdma_state(struct pci_p2pdma_map_state *state, struct device *dev,
struct page *page)
{
if (IS_ENABLED(CONFIG_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
- if (state->pgmap != page_pgmap(page))
- __pci_p2pdma_update_state(state, dev, page);
+ __pci_p2pdma_update_state(state, dev, page);
return state->map;
}
return PCI_P2PDMA_MAP_NONE;
@@ -185,7 +195,7 @@ static inline dma_addr_t
pci_p2pdma_bus_addr_map(struct pci_p2pdma_map_state *state, phys_addr_t paddr)
{
WARN_ON_ONCE(state->map != PCI_P2PDMA_MAP_BUS_ADDR);
- return paddr + to_p2p_pgmap(state->pgmap)->bus_offset;
+ return paddr + state->mem->bus_offset;
}
#endif /* _LINUX_PCI_P2P_H */
--
2.50.1
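
For readers following the API change, here is a minimal, editorial sketch
(not part of the patch or the series) of how a caller consumes the reworked
pci_p2pdma_map_state. Only pci_p2pdma_state(), pci_p2pdma_bus_addr_map() and
the map-type enum come from the header above; example_map_pages() and its
error handling are invented for illustration:

#include <linux/dma-mapping.h>
#include <linux/pci-p2pdma.h>

/*
 * Illustrative helper, not a kernel API: map an array of pages that may mix
 * regular host memory and PCI P2PDMA pages.
 */
static int example_map_pages(struct device *dev, struct page **pages,
			     int npages, enum dma_data_direction dir,
			     dma_addr_t *dma_addrs)
{
	struct pci_p2pdma_map_state p2pdma_state = {};
	int i;

	for (i = 0; i < npages; i++) {
		switch (pci_p2pdma_state(&p2pdma_state, dev, pages[i])) {
		case PCI_P2PDMA_MAP_NONE:
		case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
			/*
			 * Host memory, or P2P routed through the host bridge:
			 * use the regular streaming DMA API.
			 */
			dma_addrs[i] = dma_map_page(dev, pages[i], 0,
						    PAGE_SIZE, dir);
			if (dma_mapping_error(dev, dma_addrs[i]))
				return -EIO;
			break;
		case PCI_P2PDMA_MAP_BUS_ADDR:
			/*
			 * Traffic stays below a PCI switch: translate the CPU
			 * physical address with the provider's bus_offset; no
			 * IOMMU mapping is created.
			 */
			dma_addrs[i] = pci_p2pdma_bus_addr_map(&p2pdma_state,
					page_to_phys(pages[i]));
			break;
		default:
			/* PCI_P2PDMA_MAP_NOT_SUPPORTED and friends */
			return -EREMOTEIO;
		}
	}
	return 0;
}

Note that after this patch the provider-equality check lives inside
__pci_p2pdma_update_state(), so the per-page call above stays cheap when
consecutive pages come from the same provider.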
On Wed, Jul 23, 2025 at 04:00:03PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Extract the core P2PDMA provider information (device owner and bus
> offset) from the dev_pagemap into a dedicated p2pdma_provider structure.
> This creates a cleaner separation between the memory management layer and
> the P2PDMA functionality.
>
> The new p2pdma_provider structure contains:
> - owner: pointer to the providing device
> - bus_offset: computed offset for non-host transactions
>
> This refactoring simplifies the P2PDMA state management by removing
> the need to access pgmap internals directly. The pci_p2pdma_map_state
> now stores a pointer to the provider instead of the pgmap, making
> the API more explicit and easier to understand.

Based on the conversation, how about this as a commit message:

PCI/P2PDMA: Separate the mmap() support from the core logic

Currently the P2PDMA code requires a pgmap and a struct page to
function. This was serving three important purposes:

 - DMA API compatibility, where scatterlist required a struct page as
   input

 - Life cycle management, the percpu_ref is used to prevent UAF during
   device hot unplug

 - A way to get the P2P provider data through the pci_p2pdma_pagemap

The DMA API now has a new flow, and has gained phys_addr_t support, so
it no longer needs struct pages to perform P2P mapping. Lifecycle
management can be delegated to the user, DMABUF for instance has a
suitable invalidation protocol that does not require struct page.
Finding the P2P provider data can also be managed by the caller without
needing to look it up from the phys_addr.

Split the P2PDMA code into two layers. The optional upper layer,
effectively, provides a way to mmap() P2P memory into a VMA by
providing struct page, pgmap, a genalloc and sysfs. The lower layer
provides the actual P2P infrastructure and is wrapped up in a new
struct p2pdma_provider.

Rework the mmap layer to use the new p2pdma_provider based APIs.

Drivers that do not want to put P2P memory into VMAs can allocate a
struct p2pdma_provider after probe() starts and free it before
remove() completes. When DMA mapping the driver must convey the struct
p2pdma_provider to the DMA mapping code along with a phys_addr of the
MMIO BAR slice to map. The driver must ensure that no DMA mapping
outlives the lifetime of the struct p2pdma_provider.

The intended target of this new API layer is DMABUF. There is usually
only a single p2pdma_provider for a DMABUF exporter. Most drivers can
establish the p2pdma_provider during probe, access the single instance
during DMABUF attach and use that to drive the DMA mapping.

DMABUF provides an invalidation mechanism that can guarantee all DMA
is halted and the DMA mappings are undone prior to destroying the
struct p2pdma_provider. This ensures there is no UAF through DMABUFs
that are lingering past driver removal.

The new p2pdma_provider layer cannot be used to create P2P memory that
can be mapped into VMAs, be used with pin_user_pages(), O_DIRECT, and
so on. These use cases must still use the mmap() layer. The
p2pdma_provider layer is principally for DMABUF-like use cases where
DMABUF natively manages the life cycle and access instead of
vmas/pin_user_pages()/struct page.

Jason
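The proposed commit message above says an exporter can establish a provider
for a whole BAR at probe time. A short editorial sketch of what that could
look like; struct my_exporter and the open-coded initialisation are
placeholders (the series adds its own constructor for this), and only the two
fields defined by this patch are shown, computed the same way
pci_p2pdma_add_resource() computes them:

#include <linux/pci.h>
#include <linux/pci-p2pdma.h>

struct my_exporter {
	struct pci_dev *pdev;
	struct p2pdma_provider provider;	/* valid from probe() to remove() */
};

static void my_exporter_init_provider(struct my_exporter *exp, int bar)
{
	/*
	 * Same math the mmap layer uses: the offset between the PCI bus
	 * address and the CPU physical address of the BAR, applied later
	 * for PCI_P2PDMA_MAP_BUS_ADDR mappings.
	 */
	exp->provider.owner = &exp->pdev->dev;
	exp->provider.bus_offset = pci_bus_address(exp->pdev, bar) -
				   pci_resource_start(exp->pdev, bar);
}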
On Wed, Jul 23, 2025 at 04:00:03PM +0300, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@nvidia.com>
>
> Extract the core P2PDMA provider information (device owner and bus
> offset) from the dev_pagemap into a dedicated p2pdma_provider structure.
> This creates a cleaner separation between the memory management layer and
> the P2PDMA functionality.
>
> The new p2pdma_provider structure contains:
> - owner: pointer to the providing device
> - bus_offset: computed offset for non-host transactions
>
> This refactoring simplifies the P2PDMA state management by removing
> the need to access pgmap internals directly. The pci_p2pdma_map_state
> now stores a pointer to the provider instead of the pgmap, making
> the API more explicit and easier to understand.

I really don't see how anything becomes cleaner or simpler here.
It adds a new structure that only exists embedded in the existing one
and more code for no apparent benefit.
On Thu, Jul 24, 2025 at 09:51:45AM +0200, Christoph Hellwig wrote:
> On Wed, Jul 23, 2025 at 04:00:03PM +0300, Leon Romanovsky wrote:
> > From: Leon Romanovsky <leonro@nvidia.com>
> >
> > Extract the core P2PDMA provider information (device owner and bus
> > offset) from the dev_pagemap into a dedicated p2pdma_provider structure.
> > This creates a cleaner separation between the memory management layer and
> > the P2PDMA functionality.
> >
> > The new p2pdma_provider structure contains:
> > - owner: pointer to the providing device
> > - bus_offset: computed offset for non-host transactions
> >
> > This refactoring simplifies the P2PDMA state management by removing
> > the need to access pgmap internals directly. The pci_p2pdma_map_state
> > now stores a pointer to the provider instead of the pgmap, making
> > the API more explicit and easier to understand.
>
> I really don't see how anything becomes cleaner or simpler here.
> It adds a new structure that only exists embedded in the existing one
> and more code for no apparent benefit.

Please, see last patch in the series
https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
It gives me a way to call p2p code with stable pointer for whole BAR.

Thanks
On Thu, Jul 24, 2025 at 10:55:33AM +0300, Leon Romanovsky wrote:
> Please, see last patch in the series
> https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
> It gives me a way to call p2p code with stable pointer for whole BAR.

That simply can't work. So I guess you're trying to do the same stupid
things shut down before again? I might as well not waste my time
reviewing this.
On Thu, Jul 24, 2025 at 09:59:22AM +0200, Christoph Hellwig wrote:
> On Thu, Jul 24, 2025 at 10:55:33AM +0300, Leon Romanovsky wrote:
> > Please, see last patch in the series
> > https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
> > It gives me a way to call p2p code with stable pointer for whole BAR.
>
> That simply can't work.

Why not?

That's the whole point of this, to remove struct page and use
something else as a handle for the p2p when doing the DMA API stuff.

The caller must make sure the lifetimes all work out. The handle must
live longer than any active DMAs, etc, etc. DMABUF with invalidation
lets vfio do that.

This is why the DMA API code was taught to use phys_addr_t and not
touch the struct page so it could work with struct-pageless memory.

The idea was to end up with two layers in the P2P code where the lower
layer only works on the handle, and then there is an optional struct
page/genalloc/etc layer for places that want struct page and mmap.

Jason
On Sun, Jul 27, 2025 at 03:51:58PM -0300, Jason Gunthorpe wrote:
> On Thu, Jul 24, 2025 at 09:59:22AM +0200, Christoph Hellwig wrote:
> > On Thu, Jul 24, 2025 at 10:55:33AM +0300, Leon Romanovsky wrote:
> > > Please, see last patch in the series
> > > https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
> > > It gives me a way to call p2p code with stable pointer for whole BAR.
> >
> > That simply can't work.
>
> Why not?
>
> That's the whole point of this, to remove struct page and use
> something else as a handle for the p2p when doing the DMA API stuff.

Because the struct page is the only thing that:

 a) dma-mapping works on
 b) is the only place we can discover the routing information, but also
    more importantly ensure that the underlying page is still present
    and the device is not hot unplugged, or in a very theoretical worst
    case replaced by something else.
On Tue, Jul 29, 2025 at 09:52:09AM +0200, Christoph Hellwig wrote:
> On Sun, Jul 27, 2025 at 03:51:58PM -0300, Jason Gunthorpe wrote:
> > On Thu, Jul 24, 2025 at 09:59:22AM +0200, Christoph Hellwig wrote:
> > > On Thu, Jul 24, 2025 at 10:55:33AM +0300, Leon Romanovsky wrote:
> > > > Please, see last patch in the series
> > > > https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
> > > > It gives me a way to call p2p code with stable pointer for whole BAR.
> > >
> > > That simply can't work.
> >
> > Why not?
> >
> > That's the whole point of this, to remove struct page and use
> > something else as a handle for the p2p when doing the DMA API stuff.
>
> Because the struct page is the only thing that:
>
> a) dma-mapping works on

The main point of the "dma-mapping: migrate to physical address-based
API" series was to remove the struct page dependencies in the DMA API:
https://lore.kernel.org/all/cover.1750854543.git.leon@kernel.org/

If it is not complete, then it needs more fixing.

> b) is the only place we can discover the routing information,

This patch adds the p2pdma_provider structure to discover the routing
information, this is exactly the problem being solved here.

> but also more importantly ensure that the underlying page is
> still present and the device is not hot unplugged, or in a very
> theoretical worst case replaced by something else.

I already answered this, for DMABUF the DMABUF invalidation scheme is
used to control the lifetime and no DMA mapping outlives the provider,
and the provider doesn't outlive the driver.

Hotplug works fine. VFIO gets the driver removal callback, it
invalidates all the DMABUFs, refuses to re-validate them, destroys the
P2P provider, and ends its driver. There is no lifetime issue.

Obviously you cannot use the new p2pdma_provider mechanism without some
kind of protection against use after hot unplug, but it doesn't have to
be struct page based.

Jason
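The lifecycle described above amounts to an ordering constraint on the
exporter's remove() path. A hypothetical sketch of that ordering; the
my_exporter_*() helpers are placeholders (not VFIO or DMABUF APIs), only the
sequencing reflects the scheme described:

static void my_exporter_remove(struct pci_dev *pdev)
{
	struct my_exporter *exp = pci_get_drvdata(pdev);

	/*
	 * 1. Invalidate every attached DMABUF: importers stop their DMA and
	 *    unmap before this returns.
	 */
	my_exporter_invalidate_dmabufs(exp);

	/* 2. Refuse any further attach/map attempts. */
	my_exporter_block_revalidation(exp);

	/*
	 * 3. Only now retire the provider: no DMA mapping derived from it
	 *    can still be live, so there is no use-after-free even without
	 *    struct page / percpu_ref protection.
	 */
	kfree(exp);
}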
On Tue, Jul 29, 2025 at 09:52:09AM +0200, Christoph Hellwig wrote:
> On Sun, Jul 27, 2025 at 03:51:58PM -0300, Jason Gunthorpe wrote:
> > On Thu, Jul 24, 2025 at 09:59:22AM +0200, Christoph Hellwig wrote:
> > > On Thu, Jul 24, 2025 at 10:55:33AM +0300, Leon Romanovsky wrote:
> > > > Please, see last patch in the series
> > > > https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
> > > > It gives me a way to call p2p code with stable pointer for whole BAR.
> > >
> > > That simply can't work.
> >
> > Why not?
> >
> > That's the whole point of this, to remove struct page and use
> > something else as a handle for the p2p when doing the DMA API stuff.
>
> Because the struct page is the only thing that:
>
> a) dma-mapping works on
> b) is the only place we can discover the routing information, but also
>    more importantly ensure that the underlying page is still present
>    and the device is not hot unplugged, or in a very theoretical worst
>    case replaced by something else.

It is correct in general case, but here we are talking about MMIO
memory, which is "connected" to device X and routing information is
stable.

Thanks
On Tue, Jul 29, 2025 at 11:53:36AM +0300, Leon Romanovsky wrote:
> > Because the struct page is the only thing that:
> >
> > a) dma-mapping works on
> > b) is the only place we can discover the routing information, but also
> >    more importantly ensure that the underlying page is still present
> >    and the device is not hot unplugged, or in a very theoretical worst
> >    case replaced by something else.
>
> It is correct in general case, but here we are talking about MMIO
> memory, which is "connected" to device X and routing information is
> stable.

MMIO is literally the only thing we support to P2P to/from as that is
how PCIe P2P is defined. And no, it's not stable - devices can be
unplugged, and BARs can be reenumerated.
On Tue, Jul 29, 2025 at 12:41:00PM +0200, Christoph Hellwig wrote:
> On Tue, Jul 29, 2025 at 11:53:36AM +0300, Leon Romanovsky wrote:
> > > Because the struct page is the only thing that:
> > >
> > > a) dma-mapping works on
> > > b) is the only place...
> >
> > It is correct in general case, but here we are talking about MMIO
> > memory, which is "connected" to device X and routing information is
> > stable.
>
> MMIO is literally the only thing we support to P2P to/from as that is
> how PCIe P2P is defined. And no, it's not stable - devices can be
> unplugged, and BARs can be reenumerated.

I have a feeling that we are drifting from the current patchset to more
general discussion. The whole idea of the new DMA API is to provide
flexibility to the callers (subsystems) who are perfectly aware of
their data and limitations to implement direct addressing natively.

In this series, the device is controlled by VFIO and DMABUF. It is not
possible to unplug it without VFIO noticing it. In such a case, the
p2pdma_provider and related routing information (DMABUF) will be
reevaluated. So for VFIO + DMABUF, the pointer is very stable.

For other cases (the general case), the flow is not changed. Users will
continue to call the old and well-known pci_p2pdma_state() to calculate
the p2p type.

Thanks
On Thu, Jul 24, 2025 at 09:59:22AM +0200, Christoph Hellwig wrote:
> On Thu, Jul 24, 2025 at 10:55:33AM +0300, Leon Romanovsky wrote:
> > Please, see last patch in the series
> > https://lore.kernel.org/all/aea452cc27ca9e5169f7279d7b524190c39e7260.1753274085.git.leonro@nvidia.com
> > It gives me a way to call p2p code with stable pointer for whole BAR.
>
> That simply can't work. So I guess you're trying to do the same stupid
> things shut down before again? I might as well not waste my time
> reviewing this.

I'm not aware of anything that is not acceptable in this series. This
series focused on replacing the dma_map_resource() call from v3
https://lore.kernel.org/all/20250307052248.405803-4-vivek.kasireddy@intel.com/
with a proper API:

	if (!state) {
		addr = pci_p2pdma_bus_addr_map(provider, phys_vec->paddr);
	} else if (dma_use_iova(state)) {
		ret = dma_iova_link(attachment->dev, state, phys_vec->paddr, 0,
				    phys_vec->len, dir, DMA_ATTR_SKIP_CPU_SYNC);
		if (ret)
			goto err_free_table;

		ret = dma_iova_sync(attachment->dev, state, 0, phys_vec->len);
		if (ret)
			goto err_unmap_dma;

		addr = state->addr;
	} else {
		addr = dma_map_phys(attachment->dev, phys_vec->paddr,
				    phys_vec->len, dir, DMA_ATTR_SKIP_CPU_SYNC);
		ret = dma_mapping_error(attachment->dev, addr);
		if (ret)
			goto err_free_table;
	}

Thanks