[PATCH] mm/huge_memory: Avoid PMD-size page cache if needed

Gavin Shan posted 1 patch 1 year, 5 months ago
There is a newer version of this series
mm/huge_memory.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
[PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Gavin Shan 1 year, 5 months ago
Currently, xarray can't support arbitrary page cache size and the
largest and supported page cache size is defined as MAX_PAGECACHE_ORDER
in commit 099d90642a71 ("mm/filemap: make MAX_PAGECACHE_ORDER acceptable
to xarray"). However, it's possible to have 512MB page cache in the huge
memory collapsing path on ARM64 system whose base page size is 64KB. A
warning is raised when the huge page cache is split as shown in the
following example.

[root@dhcp-10-26-1-207 ~]# cat /proc/1/smaps | grep KernelPageSize
KernelPageSize:       64 kB
[root@dhcp-10-26-1-207 ~]# cat /tmp/test.c
   :
/*
 * Reproducer: collapse a file-backed mapping into a PMD-sized (512MB on
 * arm64 with 64KB base pages) page cache folio, then punch a hole in the
 * file to force the folio to be split — triggering the xas_split_alloc()
 * warning described above. TEST_XFS_FILENAME and TEST_MEM_SIZE are
 * defined in the elided part of the program. Returns 0 on success.
 */
int main(int argc, char **argv)
{
	const char *filename = TEST_XFS_FILENAME;
	int fd = 0;
	void *buf = (void *)-1, *p;
	int pgsize = getpagesize();
	int ret = 0;

	/* The warning only reproduces with a 64KB base page size. */
	if (pgsize != 0x10000) {
		fprintf(stdout, "System with 64KB base page size is required!\n");
		return -EPERM;
	}

	/* Disable readahead and drop caches so pages are populated freshly. */
	system("echo 0 > /sys/devices/virtual/bdi/253:0/read_ahead_kb");
	system("echo 1 > /proc/sys/vm/drop_caches");

	/* Open xfs or shmem file (read-only for the populate/collapse phase) */
	fd = open(filename, O_RDONLY);
	assert(fd > 0);

	/* Create VMA */
	buf = mmap(NULL, TEST_MEM_SIZE, PROT_READ, MAP_SHARED, fd, 0);
	assert(buf != (void *)-1);
	fprintf(stdout, "mapped buffer at 0x%p\n", buf);

	/* Populate VMA with base pages first (MADV_NOHUGEPAGE), so the
	 * collapse step below has PTE-mapped pages to work on. */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_NOHUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_POPULATE_READ);
	assert(ret == 0);

	/* Collapse VMA into a PMD-sized page cache folio. */
	ret = madvise(buf, TEST_MEM_SIZE, MADV_HUGEPAGE);
	assert(ret == 0);
	ret = madvise(buf, TEST_MEM_SIZE, MADV_COLLAPSE);
	if (ret) {
		fprintf(stdout, "Error %d to madvise(MADV_COLLAPSE)\n", errno);
		goto out;
	}

	/* Split the xarray entry by punching a hole at the end of the file.
	 * The file needs to be reopened with write permission so that
	 * fallocate() is allowed. */
	munmap(buf, TEST_MEM_SIZE);
	buf = (void *)-1;
	close(fd);
	fd = open(filename, O_RDWR);
	assert(fd > 0);
	fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
 		  TEST_MEM_SIZE - pgsize, pgsize);
out:
	if (buf != (void *)-1)
		munmap(buf, TEST_MEM_SIZE);
	if (fd > 0)
		close(fd);

	return ret;
}

[root@dhcp-10-26-1-207 ~]# gcc /tmp/test.c -o /tmp/test
[root@dhcp-10-26-1-207 ~]# /tmp/test
 ------------[ cut here ]------------
 WARNING: CPU: 25 PID: 7560 at lib/xarray.c:1025 xas_split_alloc+0xf8/0x128
 Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib    \
 nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct      \
 nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4      \
 ip_set rfkill nf_tables nfnetlink vfat fat virtio_balloon drm fuse   \
 xfs libcrc32c crct10dif_ce ghash_ce sha2_ce sha256_arm64 virtio_net  \
 sha1_ce net_failover virtio_blk virtio_console failover dimlib virtio_mmio
 CPU: 25 PID: 7560 Comm: test Kdump: loaded Not tainted 6.10.0-rc7-gavin+ #9
 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-1.el9 05/24/2024
 pstate: 83400005 (Nzcv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
 pc : xas_split_alloc+0xf8/0x128
 lr : split_huge_page_to_list_to_order+0x1c4/0x780
 sp : ffff8000ac32f660
 x29: ffff8000ac32f660 x28: ffff0000e0969eb0 x27: ffff8000ac32f6c0
 x26: 0000000000000c40 x25: ffff0000e0969eb0 x24: 000000000000000d
 x23: ffff8000ac32f6c0 x22: ffffffdfc0700000 x21: 0000000000000000
 x20: 0000000000000000 x19: ffffffdfc0700000 x18: 0000000000000000
 x17: 0000000000000000 x16: ffffd5f3708ffc70 x15: 0000000000000000
 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000
 x11: ffffffffffffffc0 x10: 0000000000000040 x9 : ffffd5f3708e692c
 x8 : 0000000000000003 x7 : 0000000000000000 x6 : ffff0000e0969eb8
 x5 : ffffd5f37289e378 x4 : 0000000000000000 x3 : 0000000000000c40
 x2 : 000000000000000d x1 : 000000000000000c x0 : 0000000000000000
 Call trace:
  xas_split_alloc+0xf8/0x128
  split_huge_page_to_list_to_order+0x1c4/0x780
  truncate_inode_partial_folio+0xdc/0x160
  truncate_inode_pages_range+0x1b4/0x4a8
  truncate_pagecache_range+0x84/0xa0
  xfs_flush_unmap_range+0x70/0x90 [xfs]
  xfs_file_fallocate+0xfc/0x4d8 [xfs]
  vfs_fallocate+0x124/0x2f0
  ksys_fallocate+0x4c/0xa0
  __arm64_sys_fallocate+0x24/0x38
  invoke_syscall.constprop.0+0x7c/0xd8
  do_el0_svc+0xb4/0xd0
  el0_svc+0x44/0x1d8
  el0t_64_sync_handler+0x134/0x150
  el0t_64_sync+0x17c/0x180

Fix it by avoiding PMD-sized page cache in the huge memory collapsing
path. After this patch is applied, the test program fails: error
-EINVAL is returned from __thp_vma_allowable_orders(), and the madvise()
system call to collapse the page caches fails accordingly.

Fixes: 6b24ca4a1a8d ("mm: Use multi-index entries in the page cache")
Cc: <stable@vger.kernel.org>  # v5.17+
Signed-off-by: Gavin Shan <gshan@redhat.com>
---
 mm/huge_memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2120f7478e55..e9335ac667be 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 
 		while (orders) {
 			addr = vma->vm_end - (PAGE_SIZE << order);
-			if (thp_vma_suitable_order(vma, addr, order))
+			if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
+			    thp_vma_suitable_order(vma, addr, order))
 				break;
 			order = next_order(&orders, order);
 		}
-- 
2.45.2
Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Matthew Wilcox 1 year, 5 months ago
On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
> +++ b/mm/huge_memory.c
> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>  
>  		while (orders) {
>  			addr = vma->vm_end - (PAGE_SIZE << order);
> -			if (thp_vma_suitable_order(vma, addr, order))
> +			if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
> +			    thp_vma_suitable_order(vma, addr, order))
>  				break;

Why does 'orders' even contain potential orders that are larger than
MAX_PAGECACHE_ORDER?

We do this at the top:

        orders &= vma_is_anonymous(vma) ?
                        THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;

include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     (BIT(PMD_ORDER) | BIT(PUD_ORDER))

... and that seems very wrong.  We support all kinds of orders for
files, not just PMD order.  We don't support PUD order at all.

What the hell is going on here?
Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Ryan Roberts 1 year, 5 months ago
On 11/07/2024 21:46, Matthew Wilcox wrote:
> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>> +++ b/mm/huge_memory.c
>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>  
>>  		while (orders) {
>>  			addr = vma->vm_end - (PAGE_SIZE << order);
>> -			if (thp_vma_suitable_order(vma, addr, order))
>> +			if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>> +			    thp_vma_suitable_order(vma, addr, order))
>>  				break;
> 
> Why does 'orders' even contain potential orders that are larger than
> MAX_PAGECACHE_ORDER?
> 
> We do this at the top:
> 
>         orders &= vma_is_anonymous(vma) ?
>                         THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
> 
> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     (BIT(PMD_ORDER) | BIT(PUD_ORDER))
> 
> ... and that seems very wrong.  We support all kinds of orders for
> files, not just PMD order.  We don't support PUD order at all.
> 
> What the hell is going on here?

Just to try to justify this a little, it was my perspective when adding (anon)
mTHP that memory was either anon or file; Anything that populated vma->vm_file
was file, including shmem, DAX, etc. Before my change THP could install PMD size
mappings for anon, and PMD or PUD size mappings for file memory (but yes, PUD
was only really applicable to DAX in practice, I believe).

I agree it would be good to clean this up, but I don't think the current code is
quite as mad as you're implying, Matthew?
Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by David Hildenbrand 1 year, 5 months ago
On 11.07.24 22:46, Matthew Wilcox wrote:
> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>> +++ b/mm/huge_memory.c
>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>   
>>   		while (orders) {
>>   			addr = vma->vm_end - (PAGE_SIZE << order);
>> -			if (thp_vma_suitable_order(vma, addr, order))
>> +			if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>> +			    thp_vma_suitable_order(vma, addr, order))
>>   				break;
> 
> Why does 'orders' even contain potential orders that are larger than
> MAX_PAGECACHE_ORDER?
> 
> We do this at the top:
> 
>          orders &= vma_is_anonymous(vma) ?
>                          THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
> 
> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     (BIT(PMD_ORDER) | BIT(PUD_ORDER))
> 
> ... and that seems very wrong.  We support all kinds of orders for
> files, not just PMD order.  We don't support PUD order at all.
> 
> What the hell is going on here?

yes, that's just absolutely confusing. I mentioned it to Ryan lately 
that we should clean that up (I wanted to look into that, but am happy 
if someone else can help).

There should likely be different defines for

DAX (PMD|PUD)

SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM 
for the time being. Hm. But shmem is already handled separately, so 
maybe we can just ignore shmem here.

PAGECACHE (1 .. MAX_PAGECACHE_ORDER)

? But it's still unclear to me.

At least DAX must stay special I think, and PAGECACHE should be capped 
at MAX_PAGECACHE_ORDER.

-- 
Cheers,

David / dhildenb
Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Gavin Shan 1 year, 5 months ago
On 7/12/24 7:03 AM, David Hildenbrand wrote:
> On 11.07.24 22:46, Matthew Wilcox wrote:
>> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>>> +++ b/mm/huge_memory.c
>>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>           while (orders) {
>>>               addr = vma->vm_end - (PAGE_SIZE << order);
>>> -            if (thp_vma_suitable_order(vma, addr, order))
>>> +            if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>>> +                thp_vma_suitable_order(vma, addr, order))
>>>                   break;
>>
>> Why does 'orders' even contain potential orders that are larger than
>> MAX_PAGECACHE_ORDER?
>>
>> We do this at the top:
>>
>>          orders &= vma_is_anonymous(vma) ?
>>                          THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>
>> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>
>> ... and that seems very wrong.  We support all kinds of orders for
>> files, not just PMD order.  We don't support PUD order at all.
>>
>> What the hell is going on here?
> 
> yes, that's just absolutely confusing. I mentioned it to Ryan lately that we should clean that up (I wanted to look into that, but am happy if someone else can help).
> 
> There should likely be different defines for
> 
> DAX (PMD|PUD)
> 
> SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM for the time being. Hm. But shmem is already handles separately, so maybe we can just ignore shmem here.
> 
> PAGECACHE (1 .. MAX_PAGECACHE_ORDER)
> 
> ? But it's still unclear to me.
> 
> At least DAX must stay special I think, and PAGECACHE should be capped at MAX_PAGECACHE_ORDER.
> 

David, I can help to clean it up. Could you please help to confirm the following
changes are exactly what you're suggesting? Hopefully, there is nothing I've missed.
The original issue can be fixed by the changes. With the changes applied, madvise(MADV_COLLAPSE)
returns with errno -22 in the test program.

The Fixes tag needs to be adjusted too.

Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface")

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2aa986a5cd1b..45909efb0ef0 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -74,7 +74,12 @@ extern struct kobj_attribute shmem_enabled_attr;
  /*
   * Mask of all large folio orders supported for file THP.
   */
-#define THP_ORDERS_ALL_FILE    (BIT(PMD_ORDER) | BIT(PUD_ORDER))
+#define THP_ORDERS_ALL_FILE_DAX                \
+       ((BIT(PMD_ORDER) | BIT(PUD_ORDER)) & (BIT(MAX_PAGECACHE_ORDER + 1) - 1))
+#define THP_ORDERS_ALL_FILE_DEFAULT    \
+       ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
+#define THP_ORDERS_ALL_FILE            \
+       (THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
  
  /*
   * Mask of all large folio orders supported for THP.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2120f7478e55..4690f33afaa6 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
         bool smaps = tva_flags & TVA_SMAPS;
         bool in_pf = tva_flags & TVA_IN_PF;
         bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
+       unsigned long supported_orders;
+
         /* Check the intersection of requested and supported orders. */
-       orders &= vma_is_anonymous(vma) ?
-                       THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
+       if (vma_is_anonymous(vma))
+               supported_orders = THP_ORDERS_ALL_ANON;
+       else if (vma_is_dax(vma))
+               supported_orders = THP_ORDERS_ALL_FILE_DAX;
+       else
+               supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
+
+       orders &= supported_orders;
         if (!orders)
                 return 0;

Thanks,
Gavin

Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by David Hildenbrand 1 year, 5 months ago
On 12.07.24 07:39, Gavin Shan wrote:
> On 7/12/24 7:03 AM, David Hildenbrand wrote:
>> On 11.07.24 22:46, Matthew Wilcox wrote:
>>> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>>>> +++ b/mm/huge_memory.c
>>>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>>            while (orders) {
>>>>                addr = vma->vm_end - (PAGE_SIZE << order);
>>>> -            if (thp_vma_suitable_order(vma, addr, order))
>>>> +            if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>>>> +                thp_vma_suitable_order(vma, addr, order))
>>>>                    break;
>>>
>>> Why does 'orders' even contain potential orders that are larger than
>>> MAX_PAGECACHE_ORDER?
>>>
>>> We do this at the top:
>>>
>>>           orders &= vma_is_anonymous(vma) ?
>>>                           THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>>
>>> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>>
>>> ... and that seems very wrong.  We support all kinds of orders for
>>> files, not just PMD order.  We don't support PUD order at all.
>>>
>>> What the hell is going on here?
>>
>> yes, that's just absolutely confusing. I mentioned it to Ryan lately that we should clean that up (I wanted to look into that, but am happy if someone else can help).
>>
>> There should likely be different defines for
>>
>> DAX (PMD|PUD)
>>
>> SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM for the time being. Hm. But shmem is already handles separately, so maybe we can just ignore shmem here.
>>
>> PAGECACHE (1 .. MAX_PAGECACHE_ORDER)
>>
>> ? But it's still unclear to me.
>>
>> At least DAX must stay special I think, and PAGECACHE should be capped at MAX_PAGECACHE_ORDER.
>>
> 
> David, I can help to clean it up. Could you please help to confirm the following

Thanks!

> changes are exactly what you're suggesting? Hopefully, there are nothing I've missed.
> The original issue can be fixed by the changes. With the changes applied, madvise(MADV_COLLAPSE)
> returns with errno -22 in the test program.
> 
> The fix tag needs to adjusted either.
> 
> Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface")
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 2aa986a5cd1b..45909efb0ef0 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -74,7 +74,12 @@ extern struct kobj_attribute shmem_enabled_attr;
>    /*
>     * Mask of all large folio orders supported for file THP.
>     */
> -#define THP_ORDERS_ALL_FILE    (BIT(PMD_ORDER) | BIT(PUD_ORDER))

DAX doesn't have any MAX_PAGECACHE_ORDER restrictions (like hugetlb). So 
this should be

/*
  * FSDAX never splits folios, so the MAX_PAGECACHE_ORDER limit does not
  * apply here.
  */
THP_ORDERS_ALL_FILE_DAX (BIT(PMD_ORDER) | BIT(PUD_ORDER))

Something like that

> +#define THP_ORDERS_ALL_FILE_DAX                \
> +       ((BIT(PMD_ORDER) | BIT(PUD_ORDER)) & (BIT(MAX_PAGECACHE_ORDER + 1) - 1))
> +#define THP_ORDERS_ALL_FILE_DEFAULT    \
> +       ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
> +#define THP_ORDERS_ALL_FILE            \
> +       (THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)

Maybe we can get rid of THP_ORDERS_ALL_FILE (to prevent abuse) and fixup
THP_ORDERS_ALL instead.

>    
>    /*
>     * Mask of all large folio orders supported for THP.
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2120f7478e55..4690f33afaa6 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>           bool smaps = tva_flags & TVA_SMAPS;
>           bool in_pf = tva_flags & TVA_IN_PF;
>           bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
> +       unsigned long supported_orders;
> +
>           /* Check the intersection of requested and supported orders. */
> -       orders &= vma_is_anonymous(vma) ?
> -                       THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
> +       if (vma_is_anonymous(vma))
> +               supported_orders = THP_ORDERS_ALL_ANON;
> +       else if (vma_is_dax(vma))
> +               supported_orders = THP_ORDERS_ALL_FILE_DAX;
> +       else
> +               supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;

This is what I had in mind.

But, do we have to special-case shmem as well or will that be handled 
correctly?

-- 
Cheers,

David / dhildenb

Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Gavin Shan 1 year, 5 months ago
On 7/13/24 11:03 AM, David Hildenbrand wrote:
> On 12.07.24 07:39, Gavin Shan wrote:
>>
>> David, I can help to clean it up. Could you please help to confirm the following
> 
> Thanks!
> 
>> changes are exactly what you're suggesting? Hopefully, there are nothing I've missed.
>> The original issue can be fixed by the changes. With the changes applied, madvise(MADV_COLLAPSE)
>> returns with errno -22 in the test program.
>>
>> The fix tag needs to adjusted either.
>>
>> Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface")
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 2aa986a5cd1b..45909efb0ef0 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -74,7 +74,12 @@ extern struct kobj_attribute shmem_enabled_attr;
>>    /*
>>     * Mask of all large folio orders supported for file THP.
>>     */
>> -#define THP_ORDERS_ALL_FILE    (BIT(PMD_ORDER) | BIT(PUD_ORDER))
> 
> DAX doesn't have any MAX_PAGECACHE_ORDER restrictions (like hugetlb). So this should be
> 
> /*
>   * FSDAX never splits folios, so the MAX_PAGECACHE_ORDER limit does not
>   * apply here.
>   */
> THP_ORDERS_ALL_FILE_DAX ((BIT(PMD_ORDER) | BIT(PUD_ORDER))
> 
> Something like that
> 

Ok. It will be corrected in v2.

>> +#define THP_ORDERS_ALL_FILE_DAX                \
>> +       ((BIT(PMD_ORDER) | BIT(PUD_ORDER)) & (BIT(MAX_PAGECACHE_ORDER + 1) - 1))
>> +#define THP_ORDERS_ALL_FILE_DEFAULT    \
>> +       ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
>> +#define THP_ORDERS_ALL_FILE            \
>> +       (THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
> 
> Maybe we can get rid of THP_ORDERS_ALL_FILE (to prevent abuse) and fixup
> THP_ORDERS_ALL instead.
> 

Sure, it will be removed in v2.

>>    /*
>>     * Mask of all large folio orders supported for THP.
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 2120f7478e55..4690f33afaa6 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>           bool smaps = tva_flags & TVA_SMAPS;
>>           bool in_pf = tva_flags & TVA_IN_PF;
>>           bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
>> +       unsigned long supported_orders;
>> +
>>           /* Check the intersection of requested and supported orders. */
>> -       orders &= vma_is_anonymous(vma) ?
>> -                       THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>> +       if (vma_is_anonymous(vma))
>> +               supported_orders = THP_ORDERS_ALL_ANON;
>> +       else if (vma_is_dax(vma))
>> +               supported_orders = THP_ORDERS_ALL_FILE_DAX;
>> +       else
>> +               supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
> 
> This is what I had in mind.
> 
> But, do we have to special-case shmem as well or will that be handled correctly?
> 

With previous fixes and this one, I don't see there is any missed cases
for shmem to have 512MB page cache, exceeding MAX_PAGECACHE_ORDER. Hopefully,
I don't miss anything from the code inspection.

- regular read/write paths: covered by the previous fixes
- synchronous readahead: covered by the previous fixes
- asynchronous readahead: page size granularity, no huge page
- page fault handling: covered by the previous fixes
- collapsing PTEs to PMD: to be covered by this patch
- swapin: shouldn't have 512MB huge page since we don't have such huge pages during swapout period
- other cases I missed (?)

Thanks,
Gavin

Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Baolin Wang 1 year, 5 months ago

On 2024/7/13 09:03, David Hildenbrand wrote:
> On 12.07.24 07:39, Gavin Shan wrote:
>> On 7/12/24 7:03 AM, David Hildenbrand wrote:
>>> On 11.07.24 22:46, Matthew Wilcox wrote:
>>>> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct 
>>>>> vm_area_struct *vma,
>>>>>            while (orders) {
>>>>>                addr = vma->vm_end - (PAGE_SIZE << order);
>>>>> -            if (thp_vma_suitable_order(vma, addr, order))
>>>>> +            if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>>>>> +                thp_vma_suitable_order(vma, addr, order))
>>>>>                    break;
>>>>
>>>> Why does 'orders' even contain potential orders that are larger than
>>>> MAX_PAGECACHE_ORDER?
>>>>
>>>> We do this at the top:
>>>>
>>>>           orders &= vma_is_anonymous(vma) ?
>>>>                           THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>>>
>>>> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     
>>>> (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>>>
>>>> ... and that seems very wrong.  We support all kinds of orders for
>>>> files, not just PMD order.  We don't support PUD order at all.
>>>>
>>>> What the hell is going on here?
>>>
>>> yes, that's just absolutely confusing. I mentioned it to Ryan lately 
>>> that we should clean that up (I wanted to look into that, but am 
>>> happy if someone else can help).
>>>
>>> There should likely be different defines for
>>>
>>> DAX (PMD|PUD)
>>>
>>> SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM 
>>> for the time being. Hm. But shmem is already handles separately, so 
>>> maybe we can just ignore shmem here.
>>>
>>> PAGECACHE (1 .. MAX_PAGECACHE_ORDER)
>>>
>>> ? But it's still unclear to me.
>>>
>>> At least DAX must stay special I think, and PAGECACHE should be 
>>> capped at MAX_PAGECACHE_ORDER.
>>>
>>
>> David, I can help to clean it up. Could you please help to confirm the 
>> following
> 
> Thanks!
> 
>> changes are exactly what you're suggesting? Hopefully, there are 
>> nothing I've missed.
>> The original issue can be fixed by the changes. With the changes 
>> applied, madvise(MADV_COLLAPSE)
>> returns with errno -22 in the test program.
>>
>> The fix tag needs to adjusted either.
>>
>> Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface")
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 2aa986a5cd1b..45909efb0ef0 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -74,7 +74,12 @@ extern struct kobj_attribute shmem_enabled_attr;
>>    /*
>>     * Mask of all large folio orders supported for file THP.
>>     */
>> -#define THP_ORDERS_ALL_FILE    (BIT(PMD_ORDER) | BIT(PUD_ORDER))
> 
> DAX doesn't have any MAX_PAGECACHE_ORDER restrictions (like hugetlb). So 
> this should be
> 
> /*
>   * FSDAX never splits folios, so the MAX_PAGECACHE_ORDER limit does not
>   * apply here.
>   */
> THP_ORDERS_ALL_FILE_DAX ((BIT(PMD_ORDER) | BIT(PUD_ORDER))
> 
> Something like that
> 
>> +#define THP_ORDERS_ALL_FILE_DAX                \
>> +       ((BIT(PMD_ORDER) | BIT(PUD_ORDER)) & (BIT(MAX_PAGECACHE_ORDER 
>> + 1) - 1))
>> +#define THP_ORDERS_ALL_FILE_DEFAULT    \
>> +       ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
>> +#define THP_ORDERS_ALL_FILE            \
>> +       (THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
> 
> Maybe we can get rid of THP_ORDERS_ALL_FILE (to prevent abuse) and fixup
> THP_ORDERS_ALL instead.
> 
>>    /*
>>     * Mask of all large folio orders supported for THP.
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 2120f7478e55..4690f33afaa6 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct 
>> vm_area_struct *vma,
>>           bool smaps = tva_flags & TVA_SMAPS;
>>           bool in_pf = tva_flags & TVA_IN_PF;
>>           bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
>> +       unsigned long supported_orders;
>> +
>>           /* Check the intersection of requested and supported orders. */
>> -       orders &= vma_is_anonymous(vma) ?
>> -                       THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>> +       if (vma_is_anonymous(vma))
>> +               supported_orders = THP_ORDERS_ALL_ANON;
>> +       else if (vma_is_dax(vma))
>> +               supported_orders = THP_ORDERS_ALL_FILE_DAX;
>> +       else
>> +               supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
> 
> This is what I had in mind.
> 
> But, do we have to special-case shmem as well or will that be handled 
> correctly?

For anonymous shmem, it is now same as anonymous THP, which can utilize 
THP_ORDERS_ALL_ANON. For tmpfs, we currently only support PMD-sized THP 
(will support more larger orders in the future). Therefore, I think we 
can reuse THP_ORDERS_ALL_ANON for shmem now:

if (vma_is_anonymous(vma) || shmem_file(vma->vm_file))
	supported_orders = THP_ORDERS_ALL_ANON;
......

Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by David Hildenbrand 1 year, 5 months ago
On 13.07.24 06:01, Baolin Wang wrote:
> 
> 
> On 2024/7/13 09:03, David Hildenbrand wrote:
>> On 12.07.24 07:39, Gavin Shan wrote:
>>> On 7/12/24 7:03 AM, David Hildenbrand wrote:
>>>> On 11.07.24 22:46, Matthew Wilcox wrote:
>>>>> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>>>>>> +++ b/mm/huge_memory.c
>>>>>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct
>>>>>> vm_area_struct *vma,
>>>>>>             while (orders) {
>>>>>>                 addr = vma->vm_end - (PAGE_SIZE << order);
>>>>>> -            if (thp_vma_suitable_order(vma, addr, order))
>>>>>> +            if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>>>>>> +                thp_vma_suitable_order(vma, addr, order))
>>>>>>                     break;
>>>>>
>>>>> Why does 'orders' even contain potential orders that are larger than
>>>>> MAX_PAGECACHE_ORDER?
>>>>>
>>>>> We do this at the top:
>>>>>
>>>>>            orders &= vma_is_anonymous(vma) ?
>>>>>                            THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>>>>
>>>>> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE
>>>>> (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>>>>
>>>>> ... and that seems very wrong.  We support all kinds of orders for
>>>>> files, not just PMD order.  We don't support PUD order at all.
>>>>>
>>>>> What the hell is going on here?
>>>>
>>>> yes, that's just absolutely confusing. I mentioned it to Ryan lately
>>>> that we should clean that up (I wanted to look into that, but am
>>>> happy if someone else can help).
>>>>
>>>> There should likely be different defines for
>>>>
>>>> DAX (PMD|PUD)
>>>>
>>>> SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM
>>>> for the time being. Hm. But shmem is already handles separately, so
>>>> maybe we can just ignore shmem here.
>>>>
>>>> PAGECACHE (1 .. MAX_PAGECACHE_ORDER)
>>>>
>>>> ? But it's still unclear to me.
>>>>
>>>> At least DAX must stay special I think, and PAGECACHE should be
>>>> capped at MAX_PAGECACHE_ORDER.
>>>>
>>>
>>> David, I can help to clean it up. Could you please help to confirm the
>>> following
>>
>> Thanks!
>>
>>> changes are exactly what you're suggesting? Hopefully, there are
>>> nothing I've missed.
>>> The original issue can be fixed by the changes. With the changes
>>> applied, madvise(MADV_COLLAPSE)
>>> returns with errno -22 in the test program.
>>>
>>> The fix tag needs to adjusted either.
>>>
>>> Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs interface")
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 2aa986a5cd1b..45909efb0ef0 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -74,7 +74,12 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>     /*
>>>      * Mask of all large folio orders supported for file THP.
>>>      */
>>> -#define THP_ORDERS_ALL_FILE    (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>
>> DAX doesn't have any MAX_PAGECACHE_ORDER restrictions (like hugetlb). So
>> this should be
>>
>> /*
>>    * FSDAX never splits folios, so the MAX_PAGECACHE_ORDER limit does not
>>    * apply here.
>>    */
>> THP_ORDERS_ALL_FILE_DAX ((BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>
>> Something like that
>>
>>> +#define THP_ORDERS_ALL_FILE_DAX                \
>>> +       ((BIT(PMD_ORDER) | BIT(PUD_ORDER)) & (BIT(MAX_PAGECACHE_ORDER
>>> + 1) - 1))
>>> +#define THP_ORDERS_ALL_FILE_DEFAULT    \
>>> +       ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
>>> +#define THP_ORDERS_ALL_FILE            \
>>> +       (THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
>>
>> Maybe we can get rid of THP_ORDERS_ALL_FILE (to prevent abuse) and fixup
>> THP_ORDERS_ALL instead.
>>
>>>     /*
>>>      * Mask of all large folio orders supported for THP.
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 2120f7478e55..4690f33afaa6 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct
>>> vm_area_struct *vma,
>>>            bool smaps = tva_flags & TVA_SMAPS;
>>>            bool in_pf = tva_flags & TVA_IN_PF;
>>>            bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
>>> +       unsigned long supported_orders;
>>> +
>>>            /* Check the intersection of requested and supported orders. */
>>> -       orders &= vma_is_anonymous(vma) ?
>>> -                       THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>> +       if (vma_is_anonymous(vma))
>>> +               supported_orders = THP_ORDERS_ALL_ANON;
>>> +       else if (vma_is_dax(vma))
>>> +               supported_orders = THP_ORDERS_ALL_FILE_DAX;
>>> +       else
>>> +               supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
>>
>> This is what I had in mind.
>>
>> But, do we have to special-case shmem as well or will that be handled
>> correctly?
> 
> For anonymous shmem, it is now same as anonymous THP, which can utilize
> THP_ORDERS_ALL_ANON.
> For tmpfs, we currently only support PMD-sized THP
> (will support larger orders in the future). Therefore, I think we
> can reuse THP_ORDERS_ALL_ANON for shmem now:
> 
> if (vma_is_anonymous(vma) || shmem_file(vma->vm_file))
> 	supported_orders = THP_ORDERS_ALL_ANON;
> ......
> 


It should be THP_ORDERS_ALL_FILE_DEFAULT (MAX_PAGECACHE_ORDER limitation 
applies).

-- 
Cheers,

David / dhildenb

Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by Baolin Wang 1 year, 5 months ago

On 2024/7/13 12:17, David Hildenbrand wrote:
> On 13.07.24 06:01, Baolin Wang wrote:
>>
>>
>> On 2024/7/13 09:03, David Hildenbrand wrote:
>>> On 12.07.24 07:39, Gavin Shan wrote:
>>>> On 7/12/24 7:03 AM, David Hildenbrand wrote:
>>>>> On 11.07.24 22:46, Matthew Wilcox wrote:
>>>>>> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>>>>>>> +++ b/mm/huge_memory.c
>>>>>>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct
>>>>>>> vm_area_struct *vma,
>>>>>>>             while (orders) {
>>>>>>>                 addr = vma->vm_end - (PAGE_SIZE << order);
>>>>>>> -            if (thp_vma_suitable_order(vma, addr, order))
>>>>>>> +            if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>>>>>>> +                thp_vma_suitable_order(vma, addr, order))
>>>>>>>                     break;
>>>>>>
>>>>>> Why does 'orders' even contain potential orders that are larger than
>>>>>> MAX_PAGECACHE_ORDER?
>>>>>>
>>>>>> We do this at the top:
>>>>>>
>>>>>>            orders &= vma_is_anonymous(vma) ?
>>>>>>                            THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>>>>>
>>>>>> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE
>>>>>> (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>>>>>
>>>>>> ... and that seems very wrong.  We support all kinds of orders for
>>>>>> files, not just PMD order.  We don't support PUD order at all.
>>>>>>
>>>>>> What the hell is going on here?
>>>>>
>>>>> yes, that's just absolutely confusing. I mentioned it to Ryan lately
>>>>> that we should clean that up (I wanted to look into that, but am
>>>>> happy if someone else can help).
>>>>>
>>>>> There should likely be different defines for
>>>>>
>>>>> DAX (PMD|PUD)
>>>>>
>>>>> SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM
>>>>> for the time being. Hm. But shmem is already handles separately, so
>>>>> maybe we can just ignore shmem here.
>>>>>
>>>>> PAGECACHE (1 .. MAX_PAGECACHE_ORDER)
>>>>>
>>>>> ? But it's still unclear to me.
>>>>>
>>>>> At least DAX must stay special I think, and PAGECACHE should be
>>>>> capped at MAX_PAGECACHE_ORDER.
>>>>>
>>>>
>>>> David, I can help to clean it up. Could you please help to confirm the
>>>> following
>>>
>>> Thanks!
>>>
>>>> changes are exactly what you're suggesting? Hopefully, there are
>>>> nothing I've missed.
>>>> The original issue can be fixed by the changes. With the changes
>>>> applied, madvise(MADV_COLLAPSE)
>>>> returns with errno -22 in the test program.
>>>>
>>>> The fix tag needs to be adjusted as well.
>>>>
>>>> Fixes: 3485b88390b0 ("mm: thp: introduce multi-size THP sysfs 
>>>> interface")
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 2aa986a5cd1b..45909efb0ef0 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -74,7 +74,12 @@ extern struct kobj_attribute shmem_enabled_attr;
>>>>     /*
>>>>      * Mask of all large folio orders supported for file THP.
>>>>      */
>>>> -#define THP_ORDERS_ALL_FILE    (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>>
>>> DAX doesn't have any MAX_PAGECACHE_ORDER restrictions (like hugetlb). So
>>> this should be
>>>
>>> /*
>>>    * FSDAX never splits folios, so the MAX_PAGECACHE_ORDER limit does 
>>> not
>>>    * apply here.
>>>    */
>>> THP_ORDERS_ALL_FILE_DAX (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>>
>>> Something like that
>>>
>>>> +#define THP_ORDERS_ALL_FILE_DAX                \
>>>> +       ((BIT(PMD_ORDER) | BIT(PUD_ORDER)) & (BIT(MAX_PAGECACHE_ORDER
>>>> + 1) - 1))
>>>> +#define THP_ORDERS_ALL_FILE_DEFAULT    \
>>>> +       ((BIT(MAX_PAGECACHE_ORDER + 1) - 1) & ~BIT(0))
>>>> +#define THP_ORDERS_ALL_FILE            \
>>>> +       (THP_ORDERS_ALL_FILE_DAX | THP_ORDERS_ALL_FILE_DEFAULT)
>>>
>>> Maybe we can get rid of THP_ORDERS_ALL_FILE (to prevent abuse) and fixup
>>> THP_ORDERS_ALL instead.
>>>
>>>>     /*
>>>>      * Mask of all large folio orders supported for THP.
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 2120f7478e55..4690f33afaa6 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -88,9 +88,17 @@ unsigned long __thp_vma_allowable_orders(struct
>>>> vm_area_struct *vma,
>>>>            bool smaps = tva_flags & TVA_SMAPS;
>>>>            bool in_pf = tva_flags & TVA_IN_PF;
>>>>            bool enforce_sysfs = tva_flags & TVA_ENFORCE_SYSFS;
>>>> +       unsigned long supported_orders;
>>>> +
>>>>            /* Check the intersection of requested and supported 
>>>> orders. */
>>>> -       orders &= vma_is_anonymous(vma) ?
>>>> -                       THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>>> +       if (vma_is_anonymous(vma))
>>>> +               supported_orders = THP_ORDERS_ALL_ANON;
>>>> +       else if (vma_is_dax(vma))
>>>> +               supported_orders = THP_ORDERS_ALL_FILE_DAX;
>>>> +       else
>>>> +               supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
>>>
>>> This is what I had in mind.
>>>
>>> But, do we have to special-case shmem as well or will that be handled
>>> correctly?
>>
>> For anonymous shmem, it is now same as anonymous THP, which can utilize
>> THP_ORDERS_ALL_ANON.
>> For tmpfs, we currently only support PMD-sized THP
>> (will support larger orders in the future). Therefore, I think we
>> can reuse THP_ORDERS_ALL_ANON for shmem now:
>>
>> if (vma_is_anonymous(vma) || shmem_file(vma->vm_file))
>>     supported_orders = THP_ORDERS_ALL_ANON;
>> ......
>>
> 
> 
> It should be THP_ORDERS_ALL_FILE_DEFAULT (MAX_PAGECACHE_ORDER limitation 
> applies).

Yes, indeed, I missed MAX_PAGECACHE_ORDER limitation.
Re: [PATCH] mm/huge_memory: Avoid PMD-size page cache if needed
Posted by David Hildenbrand 1 year, 5 months ago
On 11.07.24 23:03, David Hildenbrand wrote:
> On 11.07.24 22:46, Matthew Wilcox wrote:
>> On Thu, Jul 11, 2024 at 08:48:40PM +1000, Gavin Shan wrote:
>>> +++ b/mm/huge_memory.c
>>> @@ -136,7 +136,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
>>>    
>>>    		while (orders) {
>>>    			addr = vma->vm_end - (PAGE_SIZE << order);
>>> -			if (thp_vma_suitable_order(vma, addr, order))
>>> +			if (!(vma->vm_file && order > MAX_PAGECACHE_ORDER) &&
>>> +			    thp_vma_suitable_order(vma, addr, order))
>>>    				break;
>>
>> Why does 'orders' even contain potential orders that are larger than
>> MAX_PAGECACHE_ORDER?
>>
>> We do this at the top:
>>
>>           orders &= vma_is_anonymous(vma) ?
>>                           THP_ORDERS_ALL_ANON : THP_ORDERS_ALL_FILE;
>>
>> include/linux/huge_mm.h:#define THP_ORDERS_ALL_FILE     (BIT(PMD_ORDER) | BIT(PUD_ORDER))
>>
>> ... and that seems very wrong.  We support all kinds of orders for
>> files, not just PMD order.  We don't support PUD order at all.
>>
>> What the hell is going on here?
> 
> yes, that's just absolutely confusing. I mentioned it to Ryan lately
> that we should clean that up (I wanted to look into that, but am happy
> if someone else can help).
> 
> There should likely be different defines for
> 
> DAX (PMD|PUD)
> 
> SHMEM (PMD) -- but soon more. Not sure if we want separate ANON_SHMEM
> for the time being. Hm. But shmem is already handled separately, so
> maybe we can just ignore shmem here.

Correction: of course <= MAX_PAGECACHE_ORDER

But yeah, this needs cleanups

-- 
Cheers,

David / dhildenb