From: Yishai Hadas <yishaih@nvidia.com>
Enable p2pdma on the mlx5 PCI device to allow DMABUF-based peer-to-peer
DMA mappings.
Add implementation of the mmap_get_pfns and pgoff_to_mmap_entry device
operations required for DMABUF support in the mlx5 RDMA driver.
The pgoff_to_mmap_entry operation converts a page offset to the
corresponding rdma_user_mmap_entry by extracting the command and index
from the offset and looking it up in the ucontext's mmap_xa.
The mmap_get_pfns operation retrieves the physical address and length
from the mmap entry and obtains the p2pdma provider for the underlying
PCI device, which is needed for peer-to-peer DMA operations with
DMABUFs.
Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
---
drivers/infiniband/hw/mlx5/main.c | 72 +++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index e81080622283..f97c86c96d83 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2446,6 +2446,70 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
virt_to_page(dev->mdev->clock_info));
}
+static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa)
+{
+ resource_size_t start, end;
+ int bar;
+
+ for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
+ /* Skip BARs not present or not memory-mapped */
+ if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
+ continue;
+
+ start = pci_resource_start(pdev, bar);
+ end = pci_resource_end(pdev, bar);
+
+ if (!start || !end)
+ continue;
+
+ if (pa >= start && pa <= end)
+ return bar;
+ }
+
+ return -1;
+}
+
+static int mlx5_ib_mmap_get_pfns(struct rdma_user_mmap_entry *entry,
+ struct dma_buf_phys_vec *phys_vec,
+ struct p2pdma_provider **provider)
+{
+ struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
+ struct pci_dev *pdev = to_mdev(entry->ucontext->device)->mdev->pdev;
+ int bar;
+
+ phys_vec->paddr = mentry->address;
+ phys_vec->len = entry->npages * PAGE_SIZE;
+
+ bar = phys_addr_to_bar(pdev, phys_vec->paddr);
+ if (bar < 0)
+ return -EINVAL;
+
+ *provider = pcim_p2pdma_provider(pdev, bar);
+ /* If the kernel was not compiled with CONFIG_PCI_P2PDMA the
+ * functionality is not supported.
+ */
+ if (!*provider)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+static struct rdma_user_mmap_entry *
+mlx5_ib_pgoff_to_mmap_entry(struct ib_ucontext *ucontext, off_t pg_off)
+{
+ unsigned long entry_pgoff;
+ unsigned long idx;
+ u8 command;
+
+ pg_off = pg_off >> PAGE_SHIFT;
+ command = get_command(pg_off);
+ idx = get_extended_index(pg_off);
+
+ entry_pgoff = command << 16 | idx;
+
+ return rdma_user_mmap_entry_get_pgoff(ucontext, entry_pgoff);
+}
+
static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry)
{
struct mlx5_user_mmap_entry *mentry = to_mmmap(entry);
@@ -4360,7 +4424,13 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
if (err)
goto err_mp;
+ err = pcim_p2pdma_init(mdev->pdev);
+ if (err && err != -EOPNOTSUPP)
+ goto err_dd;
+
return 0;
+err_dd:
+ mlx5_ib_data_direct_cleanup(dev);
err_mp:
mlx5_ib_cleanup_multiport_master(dev);
err:
@@ -4412,11 +4482,13 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
.map_mr_sg_pi = mlx5_ib_map_mr_sg_pi,
.mmap = mlx5_ib_mmap,
.mmap_free = mlx5_ib_mmap_free,
+ .mmap_get_pfns = mlx5_ib_mmap_get_pfns,
.modify_cq = mlx5_ib_modify_cq,
.modify_device = mlx5_ib_modify_device,
.modify_port = mlx5_ib_modify_port,
.modify_qp = mlx5_ib_modify_qp,
.modify_srq = mlx5_ib_modify_srq,
+ .pgoff_to_mmap_entry = mlx5_ib_pgoff_to_mmap_entry,
.pre_destroy_cq = mlx5_ib_pre_destroy_cq,
.poll_cq = mlx5_ib_poll_cq,
.post_destroy_cq = mlx5_ib_post_destroy_cq,
--
2.49.0
On Thu, Jan 08, 2026 at 01:11:15PM +0200, Edward Srouji wrote:
> +static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa)
> +{
> + resource_size_t start, end;
> + int bar;
> +
> + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
> + /* Skip BARs not present or not memory-mapped */
> + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
> + continue;
> +
> + start = pci_resource_start(pdev, bar);
> + end = pci_resource_end(pdev, bar);
> +
> + if (!start || !end)
> + continue;
> +
> + if (pa >= start && pa <= end)
> + return bar;
> + }
Don't we know which of the two BARs the mmap entry came from based on
its type? This seems like overkill..
Jason
On 20/01/2026 20:18, Jason Gunthorpe wrote:
> On Thu, Jan 08, 2026 at 01:11:15PM +0200, Edward Srouji wrote:
>> +static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa)
>> +{
>> + resource_size_t start, end;
>> + int bar;
>> +
>> + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) {
>> + /* Skip BARs not present or not memory-mapped */
>> + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM))
>> + continue;
>> +
>> + start = pci_resource_start(pdev, bar);
>> + end = pci_resource_end(pdev, bar);
>> +
>> + if (!start || !end)
>> + continue;
>> +
>> + if (pa >= start && pa <= end)
>> + return bar;
>> + }
>
> Don't we know which of the two BARs the mmap entry came from based on
> its type? This seems like overkill..
>
Actually no.
Currently, a given type can reside on different BARs depending on the
function type (e.g. PF/SF).
As we don't have any capability/knowledge for the above mapping, we would
prefer the above code, which finds the correct BAR (for now, 0 or 2)
dynamically.
Yishai
© 2016 - 2026 Red Hat, Inc.