[PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings

Peter Xu posted 4 patches 2 months ago
[PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Peter Xu 2 months ago
This patch enables best-effort mmap() for vfio-pci bars even without
MAP_FIXED, so as to utilize huge pfnmaps as much as possible.  It should
also avoid userspace changes (switching to MAP_FIXED with pre-aligned VA
addresses) to start enabling huge pfnmaps on VFIO bars.

Here the trick is making sure the MMIO PFNs will be aligned with the VAs
allocated from mmap() when !MAP_FIXED, so that whatever returned from
mmap(!MAP_FIXED) of vfio-pci MMIO regions will be automatically suitable
for huge pfnmaps as much as possible.

To achieve that, a custom vfio_device's get_mapping_hint() for vfio-pci
devices is needed.

Note that BAR's MMIO physical addresses should normally be guaranteed to be
BAR-size aligned.  It means the MMIO address will also always be aligned
with vfio-pci's file offset address space, per VFIO_PCI_OFFSET_SHIFT.

With that guaranteed, VA allocator can calculate the alignment with pgoff,
which will be further aligned with the MMIO physical addresses to be mapped
in the VMA later.

So far, stick with the simple plan to rely on the hardware assumption that
should always be true.  Leave it for later if pgoff needs adjustments when
there's a real demand of it when calculating the alignment.

For discussion on the requirement of this feature, see:

https://lore.kernel.org/linux-pci/20250529214414.1508155-1-amastro@fb.com/

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 drivers/vfio/pci/vfio_pci.c      |  1 +
 drivers/vfio/pci/vfio_pci_core.c | 49 ++++++++++++++++++++++++++++++++
 include/linux/vfio_pci_core.h    |  2 ++
 3 files changed, 52 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index ac10f14417f2f..8f29037cee6eb 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -145,6 +145,7 @@ static const struct vfio_device_ops vfio_pci_ops = {
 	.detach_ioas	= vfio_iommufd_physical_detach_ioas,
 	.pasid_attach_ioas	= vfio_iommufd_physical_pasid_attach_ioas,
 	.pasid_detach_ioas	= vfio_iommufd_physical_pasid_detach_ioas,
+	.get_mapping_order	= vfio_pci_core_get_mapping_order,
 };
 
 static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 7dcf5439dedc9..28ab37715acc0 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1640,6 +1640,55 @@ static unsigned long vma_to_pfn(struct vm_area_struct *vma)
 	return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
 }
 
+/*
+ * Hint function for mmap() about the size of mapping to be carried out.
+ * This helps to enable huge pfnmaps as much as possible on BAR mappings.
+ *
+ * This function does the minimum check on mmap() parameters to make the
+ * hint valid only. The majority of mmap() sanity check will be done later
+ * in mmap().
+ */
+int vfio_pci_core_get_mapping_order(struct vfio_device *device,
+				    unsigned long pgoff, size_t len)
+{
+	struct vfio_pci_core_device *vdev =
+	    container_of(device, struct vfio_pci_core_device, vdev);
+	struct pci_dev *pdev = vdev->pdev;
+	unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
+	unsigned long req_start;
+	size_t phys_len;
+
+	/* Currently, only bars 0-5 supports huge pfnmap */
+	if (index >= VFIO_PCI_ROM_REGION_INDEX)
+		return 0;
+
+	/*
+	 * NOTE: we're keeping things simple as of now, assuming the
+	 * physical address of BARs (aka, pci_resource_start(pdev, index))
+	 * should always be aligned with pgoff in vfio-pci's address space.
+	 */
+	req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
+	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
+
+	/*
+	 * If this happens, it will probably fail mmap() later.. mapping
+	 * hint isn't important anymore.
+	 */
+	if (req_start >= phys_len)
+		return 0;
+
+	phys_len = MIN(phys_len - req_start, len);
+
+	if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP) && phys_len >= PUD_SIZE)
+		return PUD_ORDER;
+
+	if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP) && phys_len >= PMD_SIZE)
+		return PMD_ORDER;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_pci_core_get_mapping_order);
+
 static vm_fault_t vfio_pci_mmap_huge_fault(struct vm_fault *vmf,
 					   unsigned int order)
 {
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index f541044e42a2a..d320dfacc5681 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -119,6 +119,8 @@ ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
 		size_t count, loff_t *ppos);
 ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
 		size_t count, loff_t *ppos);
+int vfio_pci_core_get_mapping_order(struct vfio_device *device,
+		unsigned long pgoff, size_t len);
 int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma);
 void vfio_pci_core_request(struct vfio_device *core_vdev, unsigned int count);
 int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
-- 
2.50.1
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Alex Mastro 2 months ago
On Thu, Dec 04, 2025 at 10:10:03AM -0500, Peter Xu wrote:
> To achieve that, a custom vfio_device's get_mapping_hint() for vfio-pci
> devices is needed.

nit get_mapping_order()
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Jason Gunthorpe 2 months ago
On Thu, Dec 04, 2025 at 10:10:03AM -0500, Peter Xu wrote:

> +/*
> + * Hint function for mmap() about the size of mapping to be carried out.
> + * This helps to enable huge pfnmaps as much as possible on BAR mappings.
> + *
> + * This function does the minimum check on mmap() parameters to make the
> + * hint valid only. The majority of mmap() sanity check will be done later
> + * in mmap().
> + */
> +int vfio_pci_core_get_mapping_order(struct vfio_device *device,
> +				    unsigned long pgoff, size_t len)
> +{
> +	struct vfio_pci_core_device *vdev =
> +	    container_of(device, struct vfio_pci_core_device, vdev);
> +	struct pci_dev *pdev = vdev->pdev;
> +	unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> +	unsigned long req_start;
> +	size_t phys_len;
> +
> +	/* Currently, only bars 0-5 supports huge pfnmap */
> +	if (index >= VFIO_PCI_ROM_REGION_INDEX)
> +		return 0;
> +
> +	/*
> +	 * NOTE: we're keeping things simple as of now, assuming the
> +	 * physical address of BARs (aka, pci_resource_start(pdev, index))
> +	 * should always be aligned with pgoff in vfio-pci's address space.
> +	 */
> +	req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
> +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
> +
> +	/*
> +	 * If this happens, it will probably fail mmap() later.. mapping
> +	 * hint isn't important anymore.
> +	 */
> +	if (req_start >= phys_len)
> +		return 0;
> +
> +	phys_len = MIN(phys_len - req_start, len);
> +
> +	if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP) && phys_len >= PUD_SIZE)
> +		return PUD_ORDER;
> +
> +	if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP) && phys_len >= PMD_SIZE)
> +		return PMD_ORDER;
> +

This seems a bit weird, the vma length is already known, it is len,
why do we go to all this trouble to recalculate len in terms of phys?

If the length is wrong the mmap will fail, so there is no issue with
returning a larger order here.

I feel this should just return the order based on pci_resource_len()?

And shouldn't the mm be the one aligning it to what the arch can do
not drives?

Jason
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Peter Xu 2 months ago
On Sun, Dec 07, 2025 at 12:26:37PM -0400, Jason Gunthorpe wrote:
> On Thu, Dec 04, 2025 at 10:10:03AM -0500, Peter Xu wrote:
> 
> > +/*
> > + * Hint function for mmap() about the size of mapping to be carried out.
> > + * This helps to enable huge pfnmaps as much as possible on BAR mappings.
> > + *
> > + * This function does the minimum check on mmap() parameters to make the
> > + * hint valid only. The majority of mmap() sanity check will be done later
> > + * in mmap().
> > + */
> > +int vfio_pci_core_get_mapping_order(struct vfio_device *device,
> > +				    unsigned long pgoff, size_t len)
> > +{
> > +	struct vfio_pci_core_device *vdev =
> > +	    container_of(device, struct vfio_pci_core_device, vdev);
> > +	struct pci_dev *pdev = vdev->pdev;
> > +	unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
> > +	unsigned long req_start;
> > +	size_t phys_len;
> > +
> > +	/* Currently, only bars 0-5 supports huge pfnmap */
> > +	if (index >= VFIO_PCI_ROM_REGION_INDEX)
> > +		return 0;
> > +
> > +	/*
> > +	 * NOTE: we're keeping things simple as of now, assuming the
> > +	 * physical address of BARs (aka, pci_resource_start(pdev, index))
> > +	 * should always be aligned with pgoff in vfio-pci's address space.
> > +	 */
> > +	req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
> > +	phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
> > +
> > +	/*
> > +	 * If this happens, it will probably fail mmap() later.. mapping
> > +	 * hint isn't important anymore.
> > +	 */
> > +	if (req_start >= phys_len)
> > +		return 0;
> > +
> > +	phys_len = MIN(phys_len - req_start, len);
> > +
> > +	if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP) && phys_len >= PUD_SIZE)
> > +		return PUD_ORDER;
> > +
> > +	if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP) && phys_len >= PMD_SIZE)
> > +		return PMD_ORDER;
> > +
> 
> This seems a bit weird, the vma length is already known, it is len,
> why do we go to all this trouble to recalculate len in terms of phys?
> 
> If the length is wrong the mmap will fail, so there is no issue with
> returning a larger order here.
> 
> I feel this should just return the order based on pci_resource_len()?

IIUC there's a trivial difference when partial of a huge bar is mapped.

Example: 1G bar, map range (pgoff=2M, size=1G-2M).

If we return bar size order, we'd say 1G, however then it means we'll do
the alignment with 1G. __thp_get_unmapped_area() will think it's not
proper, because:

	loff_t off_end = off + len;
	loff_t off_align = round_up(off, size);

	if (off_end <= off_align || (off_end - off_align) < size)
		return 0;

Here what we really want is to map (2M, 1G-2M) with 2M huge, not 1G, nor
4K.

> 
> And shouldn't the mm be the one aligning it to what the arch can do
> not drives?

Note that here checking CONFIG_ARCH_SUPPORTS_P*D_PFNMAP is a vfio behavior,
pairing with the huge_fault() of vfio-pci driver.  It implies if vfio-pci's
huge pfnmap is enabled or not.  If it's not enabled, we don't need to
report larger orders here.

Said that, this is still a valid point, that core mm should likely also
check against the configs when the kernel was built, though it should not
check against CONFIG_ARCH_SUPPORTS_PMD_PFNMAP.. Instead, it should check
HAVE_ARCH_TRANSPARENT_HUGEPAGE*.

But then... I really want to avoid adding more dependencies to THPs in core
mm on pfnmaps.  I used to decouple THP and huge mappings, that series
wasn't going anywhere, but adding these checks will add more dependencies..

Shall I keep it simple to leave it to drivers, until we have something more
solid (I think we need HAVE_ARCH_HUGE_P*D_LEAVES here)?

Even with that config ready, drivers should always still do proper check on
its own (drivers need to support huge pfnmaps here first before reporting
high orders).  So what I can add into core mm to check arch support would
only be an extra layer of safety net, not much real help but burn some cpu
cycles, IMHO...

Thanks,

-- 
Peter Xu
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Wed, Dec 10, 2025 at 03:43:43PM -0500, Peter Xu wrote:
> > This seems a bit weird, the vma length is already known, it is len,
> > why do we go to all this trouble to recalculate len in terms of phys?
> > 
> > If the length is wrong the mmap will fail, so there is no issue with
> > returning a larger order here.
> > 
> > I feel this should just return the order based on pci_resource_len()?
> 
> IIUC there's a trivial difference when partial of a huge bar is mapped.
> 
> Example: 1G bar, map range (pgoff=2M, size=1G-2M).
> 
> If we return bar size order, we'd say 1G, however then it means we'll do
> the alignment with 1G. __thp_get_unmapped_area() will think it's not
> proper, because:
> 
> 	loff_t off_end = off + len;
> 	loff_t off_align = round_up(off, size);
> 
> 	if (off_end <= off_align || (off_end - off_align) < size)
> 		return 0;
> 
> Here what we really want is to map (2M, 1G-2M) with 2M huge, not 1G, nor
> 4K.

This was the point of my prior email, the alignment calculation can't
just be 'align to a size'. The core code needs to choose a VA such
that:

   VA % (1 << order) == pg_off % (1 << order)

So if VFIO returns 1G then the VA should be aligned to 2M within a 1G
region. This allows opportunities to increase the alignment as the
mapping continues, eg if the arch supports a 32M 16x2M contiguous page
then we'd get 32M mappings with your example.

The core code should adjust the target order from the driver by:
   lowest of order or size rounded down to a power of two
 then
   highest arch supported leaf size below the above

None of this logic should be in drivers.

The way to think about this is that the driver is returning an order
which indicates the maximum case where:

   VA % (1 << order) == pg_off % (1 << order)

Could be true. Eg a PCI BAR returns an order that is the size of the
BAR because it is always true. Something that stores at most 1G pages
would return 1G pages, etc.

> Note that here checking CONFIG_ARCH_SUPPORTS_P*D_PFNMAP is a vfio behavior,
> pairing with the huge_fault() of vfio-pci driver.  It implies if vfio-pci's
> huge pfnmap is enabled or not.  If it's not enabled, we don't need to
> report larger orders here.

Honestly, I'd ignore this, and I'm not sure VFIO should be testing
those in the huge_fault either. Again the core code should deal with
it.

> Shall I keep it simple to leave it to drivers, until we have something more
> solid (I think we need HAVE_ARCH_HUGE_P*D_LEAVES here)?

Logic like this should not be in drivers.

> Even with that config ready, drivers should always still do proper check on
> its own (drivers need to support huge pfnmaps here first before reporting
> high orders).  

Drivers shouldn't implement this alignment function without also
implementing huge fault, it is pointless. Don't see a reason to add
extra complexity.

Jason
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Peter Xu 1 month, 3 weeks ago
On Tue, Dec 16, 2025 at 10:42:24AM -0400, Jason Gunthorpe wrote:
> On Wed, Dec 10, 2025 at 03:43:43PM -0500, Peter Xu wrote:
> > > This seems a bit weird, the vma length is already known, it is len,
> > > why do we go to all this trouble to recalculate len in terms of phys?
> > > 
> > > If the length is wrong the mmap will fail, so there is no issue with
> > > returning a larger order here.
> > > 
> > > I feel this should just return the order based on pci_resource_len()?
> > 
> > IIUC there's a trivial difference when partial of a huge bar is mapped.
> > 
> > Example: 1G bar, map range (pgoff=2M, size=1G-2M).
> > 
> > If we return bar size order, we'd say 1G, however then it means we'll do
> > the alignment with 1G. __thp_get_unmapped_area() will think it's not
> > proper, because:
> > 
> > 	loff_t off_end = off + len;
> > 	loff_t off_align = round_up(off, size);
> > 
> > 	if (off_end <= off_align || (off_end - off_align) < size)
> > 		return 0;
> > 
> > Here what we really want is to map (2M, 1G-2M) with 2M huge, not 1G, nor
> > 4K.
> 
> This was the point of my prior email, the alignment calculation can't
> just be 'align to a size'. The core code needs to choose a VA such
> that:

IMHO these are two different things we're discussing here.  I've replied in
the other email about pgoff alignments, I think it's properly done, let's
keep the discussion there.  I'll reply to the other issue raised.

> 
>    VA % (1 << order) == pg_off % (1 << order)
> 
> So if VFIO returns 1G then the VA should be aligned to 2M within a 1G
> region. This allows opportunities to increase the alignment as the
> mapping continues, eg if the arch supports a 32M 16x2M contiguous page
> then we'd get 32M mappings with your example.
> 
> The core code should adjust the target order from the driver by:
>    lowest of order or size rounded down to a power of two
>  then
>    highest arch supported leaf size below the above

Yes, maybe this would be better.

E.g. I would expect if a driver has 32M returned (order=13), then on x86_64
it should be adjusted to 2M (order=9), but on a 4K pgsize arm64 it should
be kept as 32M (order=13) as it matches contpmds.

Do we have any function that we can fetch the best mapping lower than a
specific order?

> 
> None of this logic should be in drivers.

I still think it's the driver's decision to have its own macro controlling
the huge pfnmap behavior.  I agree with you core mm can have it, I don't
see it blocks the driver not returning huge order if huge pfnmap is turned
off.  VFIO-PCI currently indeed only depends directly on global THP
configs, but I don't see why it's strictly needed.  So I think it's fine if
a driver (even if global THP enabled for pmd/pud) deselect huge pfnmap for
other reasons, then here the order returned can still always be PSIZE for
the driver.  It's really not a huge deal to me.

> 
> The way to think about this is that the driver is returning an order
> which indicates the maximum case where:
> 
>    VA % (1 << order) == pg_off % (1 << order)
> 
> Could be true. Eg a PCI BAR returns an order that is the size of the
> BAR because it is always true. Something that stores at most 1G pages
> would return 1G pages, etc.
> 
> > Note that here checking CONFIG_ARCH_SUPPORTS_P*D_PFNMAP is a vfio behavior,
> > pairing with the huge_fault() of vfio-pci driver.  It implies if vfio-pci's
> > huge pfnmap is enabled or not.  If it's not enabled, we don't need to
> > report larger orders here.
> 
> Honestly, I'd ignore this, and I'm not sure VFIO should be testing
> those in the huge_fault either. Again the core code should deal with
> it.
> 
> > Shall I keep it simple to leave it to drivers, until we have something more
> > solid (I think we need HAVE_ARCH_HUGE_P*D_LEAVES here)?
> 
> Logic like this should not be in drivers.
> 
> > Even with that config ready, drivers should always still do proper check on
> > its own (drivers need to support huge pfnmaps here first before reporting
> > high orders).  
> 
> Drivers shouldn't implement this alignment function without also
> implementing huge fault, it is pointless. Don't see a reason to add
> extra complexity.

It's not implementing the order hint without huge fault.  It's when both
are turned off in a kernel config.. then the order hint (even from driver
POV) shouldn't need to be reported.

I don't know why you have so strong feeling on having a config check in
vfio-pci drivers is bad.  I still think it's good to have it pairing with
the same macro in huge_fault(), because it's essentially part of the whole
pfnmap feature so it's fair they're guarded by the same kernel config, but
I'm ok either way in case of current case of vfio-pci where it 100% depends
on global THP setups.

-- 
Peter Xu
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Jason Gunthorpe 1 month, 3 weeks ago
On Tue, Dec 16, 2025 at 11:01:00AM -0500, Peter Xu wrote:
> Do we have any function that we can fetch the best mapping lower than a
> specific order?

I'm not aware of anything

> > None of this logic should be in drivers.
> 
> I still think it's the driver's decision to have its own macro controlling
> the huge pfnmap behavior.  I agree with you core mm can have it, I don't
> see it blocks the driver not returning huge order if huge pfnmap is turned
> off.  VFIO-PCI currently indeed only depends directly on global THP
> configs, but I don't see why it's strictly needed.  So I think it's fine if
> a driver (even if global THP enabled for pmd/pud) deselect huge pfnmap for
> other reasons, then here the order returned can still always be PSIZE for
> the driver.  It's really not a huge deal to me.

All these APIs should be around the idea that the driver just returns
what it has and the core mm places it into ptes. There is not a good
reason drivers should be overriding this logic or doing their own
thing.

> > Drivers shouldn't implement this alignment function without also
> > implementing huge fault, it is pointless. Don't see a reason to add
> > extra complexity.
> 
> It's not implementing the order hint without huge fault.  It's when both
> are turned off in a kernel config.. then the order hint (even from driver
> POV) shouldn't need to be reported.

No, it should still all be the same the core code just won't call the
function.

> I don't know why you have so strong feeling on having a config check in
> vfio-pci drivers is bad.

It is leaking MM details into drivers that should not be in drivers.

> I still think it's good to have it pairing with the same macro in
> huge_fault(),

It should not be there either.

Jason
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by Peter Xu 1 month, 3 weeks ago
On Tue, Dec 16, 2025 at 03:01:31PM -0400, Jason Gunthorpe wrote:
> On Tue, Dec 16, 2025 at 11:01:00AM -0500, Peter Xu wrote:
> > Do we have any function that we can fetch the best mapping lower than a
> > specific order?
> 
> I'm not aware of anything

Maybe I can introduce a per-arch helper for it, then.  I'll see if I can
cover some tests from ARM side, or I'll enable x86_64 first so we can do it
in two steps.

> 
> > > None of this logic should be in drivers.
> > 
> > I still think it's the driver's decision to have its own macro controlling
> > the huge pfnmap behavior.  I agree with you core mm can have it, I don't
> > see it blocks the driver not returning huge order if huge pfnmap is turned
> > off.  VFIO-PCI currently indeed only depends directly on global THP
> > configs, but I don't see why it's strictly needed.  So I think it's fine if
> > a driver (even if global THP enabled for pmd/pud) deselect huge pfnmap for
> > other reasons, then here the order returned can still always be PSIZE for
> > the driver.  It's really not a huge deal to me.
> 
> All these APIs should be around the idea that the driver just returns
> what it has and the core mm places it into ptes. There is not a good
> reason drivers should be overriding this logic or doing their own
> thing.

I'll make sure the driver will not need to consider size of mapping that
arch would support.

> 
> > > Drivers shouldn't implement this alignment function without also
> > > implementing huge fault, it is pointless. Don't see a reason to add
> > > extra complexity.
> > 
> > It's not implementing the order hint without huge fault.  It's when both
> > are turned off in a kernel config.. then the order hint (even from driver
> > POV) shouldn't need to be reported.
> 
> No, it should still all be the same the core code just won't call the
> function.
> 
> > I don't know why you have so strong feeling on having a config check in
> > vfio-pci drivers is bad.
> 
> It is leaking MM details into drivers that should not be in drivers.

To me it still makes perfect sense here to pair with huge_fault(), and it's
driver knowledge alone.  It has nothing to do with leaking mm details.

I think I get your point above, maybe when the core mm fallback paths not
available yet we can mix things together. I'll see what I can do when
repost.

-- 
Peter Xu
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by kernel test robot 2 months ago
Hi Peter,

kernel test robot noticed the following build warnings:

[auto build test WARNING on awilliam-vfio/for-linus]
[also build test WARNING on v6.18]
[cannot apply to akpm-mm/mm-everything awilliam-vfio/next brauner-vfs/vfs.all linus/master next-20251205]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-thp-Allow-thp_get_unmapped_area_vmflags-to-take-alignment/20251204-231258
base:   https://github.com/awilliam/linux-vfio.git for-linus
patch link:    https://lore.kernel.org/r/20251204151003.171039-5-peterx%40redhat.com
patch subject: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
config: i386-randconfig-006-20251205 (https://download.01.org/0day-ci/archive/20251205/202512051509.bh8Oncoq-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251205/202512051509.bh8Oncoq-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512051509.bh8Oncoq-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> drivers/vfio/pci/vfio_pci_core.c:1670:44: warning: shift count >= width of type [-Wshift-count-overflow]
    1670 |         req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
         |                                                   ^  ~~~~~~~~~~~~~~~~~~~~~
   1 warning generated.


vim +1670 drivers/vfio/pci/vfio_pci_core.c

  1642	
  1643	/*
  1644	 * Hint function for mmap() about the size of mapping to be carried out.
  1645	 * This helps to enable huge pfnmaps as much as possible on BAR mappings.
  1646	 *
  1647	 * This function does the minimum check on mmap() parameters to make the
  1648	 * hint valid only. The majority of mmap() sanity check will be done later
  1649	 * in mmap().
  1650	 */
  1651	int vfio_pci_core_get_mapping_order(struct vfio_device *device,
  1652					    unsigned long pgoff, size_t len)
  1653	{
  1654		struct vfio_pci_core_device *vdev =
  1655		    container_of(device, struct vfio_pci_core_device, vdev);
  1656		struct pci_dev *pdev = vdev->pdev;
  1657		unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
  1658		unsigned long req_start;
  1659		size_t phys_len;
  1660	
  1661		/* Currently, only bars 0-5 supports huge pfnmap */
  1662		if (index >= VFIO_PCI_ROM_REGION_INDEX)
  1663			return 0;
  1664	
  1665		/*
  1666		 * NOTE: we're keeping things simple as of now, assuming the
  1667		 * physical address of BARs (aka, pci_resource_start(pdev, index))
  1668		 * should always be aligned with pgoff in vfio-pci's address space.
  1669		 */
> 1670		req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
  1671		phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
  1672	
  1673		/*
  1674		 * If this happens, it will probably fail mmap() later.. mapping
  1675		 * hint isn't important anymore.
  1676		 */
  1677		if (req_start >= phys_len)
  1678			return 0;
  1679	
  1680		phys_len = MIN(phys_len - req_start, len);
  1681	
  1682		if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP) && phys_len >= PUD_SIZE)
  1683			return PUD_ORDER;
  1684	
  1685		if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP) && phys_len >= PMD_SIZE)
  1686			return PMD_ORDER;
  1687	
  1688		return 0;
  1689	}
  1690	EXPORT_SYMBOL_GPL(vfio_pci_core_get_mapping_order);
  1691	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
Posted by kernel test robot 2 months ago
Hi Peter,

kernel test robot noticed the following build warnings:

[auto build test WARNING on awilliam-vfio/for-linus]
[also build test WARNING on linus/master v6.18]
[cannot apply to akpm-mm/mm-everything awilliam-vfio/next brauner-vfs/vfs.all next-20251204]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Peter-Xu/mm-thp-Allow-thp_get_unmapped_area_vmflags-to-take-alignment/20251204-231258
base:   https://github.com/awilliam/linux-vfio.git for-linus
patch link:    https://lore.kernel.org/r/20251204151003.171039-5-peterx%40redhat.com
patch subject: [PATCH v2 4/4] vfio-pci: Best-effort huge pfnmaps with !MAP_FIXED mappings
config: i386-buildonly-randconfig-004-20251205 (https://download.01.org/0day-ci/archive/20251205/202512051241.QtfYgqkx-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251205/202512051241.QtfYgqkx-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512051241.QtfYgqkx-lkp@intel.com/

All warnings (new ones prefixed by >>):

   drivers/vfio/pci/vfio_pci_core.c: In function 'vfio_pci_core_get_mapping_order':
>> drivers/vfio/pci/vfio_pci_core.c:1670:51: warning: left shift count >= width of type [-Wshift-count-overflow]
    1670 |         req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
         |                                                   ^~


vim +1670 drivers/vfio/pci/vfio_pci_core.c

  1642	
  1643	/*
  1644	 * Hint function for mmap() about the size of mapping to be carried out.
  1645	 * This helps to enable huge pfnmaps as much as possible on BAR mappings.
  1646	 *
  1647	 * This function does the minimum check on mmap() parameters to make the
  1648	 * hint valid only. The majority of mmap() sanity check will be done later
  1649	 * in mmap().
  1650	 */
  1651	int vfio_pci_core_get_mapping_order(struct vfio_device *device,
  1652					    unsigned long pgoff, size_t len)
  1653	{
  1654		struct vfio_pci_core_device *vdev =
  1655		    container_of(device, struct vfio_pci_core_device, vdev);
  1656		struct pci_dev *pdev = vdev->pdev;
  1657		unsigned int index = pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
  1658		unsigned long req_start;
  1659		size_t phys_len;
  1660	
  1661		/* Currently, only bars 0-5 supports huge pfnmap */
  1662		if (index >= VFIO_PCI_ROM_REGION_INDEX)
  1663			return 0;
  1664	
  1665		/*
  1666		 * NOTE: we're keeping things simple as of now, assuming the
  1667		 * physical address of BARs (aka, pci_resource_start(pdev, index))
  1668		 * should always be aligned with pgoff in vfio-pci's address space.
  1669		 */
> 1670		req_start = (pgoff << PAGE_SHIFT) & ((1UL << VFIO_PCI_OFFSET_SHIFT) - 1);
  1671		phys_len = PAGE_ALIGN(pci_resource_len(pdev, index));
  1672	
  1673		/*
  1674		 * If this happens, it will probably fail mmap() later.. mapping
  1675		 * hint isn't important anymore.
  1676		 */
  1677		if (req_start >= phys_len)
  1678			return 0;
  1679	
  1680		phys_len = MIN(phys_len - req_start, len);
  1681	
  1682		if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PUD_PFNMAP) && phys_len >= PUD_SIZE)
  1683			return PUD_ORDER;
  1684	
  1685		if (IS_ENABLED(CONFIG_ARCH_SUPPORTS_PMD_PFNMAP) && phys_len >= PMD_SIZE)
  1686			return PMD_ORDER;
  1687	
  1688		return 0;
  1689	}
  1690	EXPORT_SYMBOL_GPL(vfio_pci_core_get_mapping_order);
  1691	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki