Suppose a VMA of size 64K maps a file/shmem file at index 0, and the
pagecache at index 0 contains an order-9 folio. If do_fault_around is able
to find this folio, filemap_map_pages ultimately maps the first 64K of the
folio into the pagetable, thus reducing the number of future page faults.
If fault-around fails to satisfy the fault, or if it is a write fault, then
we use vma->vm_ops->fault to find/create the folio, followed by
finish_fault to map the folio into the pagetable. On encountering such a
large folio crossing the VMA boundary, finish_fault currently falls back
to mapping only a single page.
Align finish_fault with filemap_map_pages, and map as many pages as
possible, without crossing VMA/PMD/file boundaries.
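To illustrate the clamping arithmetic, here is a minimal userspace sketch
(hypothetical names, standalone logic only; the actual kernel change is in
the diff below):

#include <stdio.h>

#define PAGE_SIZE (1UL << 12)
#define PMD_SIZE (1UL << 21)

/*
 * How many pages finish_fault may map in one go: clamp the range to the
 * VMA, to the PMD table containing the fault address, and to the folio.
 */
static unsigned long nr_mappable_pages(unsigned long vm_start,
				       unsigned long vm_end,
				       unsigned long fault_addr,
				       unsigned long folio_start_va,
				       unsigned long folio_pages)
{
	unsigned long pmd_start = fault_addr & ~(PMD_SIZE - 1);
	unsigned long pmd_end = pmd_start + PMD_SIZE;
	unsigned long folio_end_va = folio_start_va + folio_pages * PAGE_SIZE;
	unsigned long start, end;

	start = vm_start > pmd_start ? vm_start : pmd_start;
	if (folio_start_va > start)
		start = folio_start_va;
	end = vm_end < pmd_end ? vm_end : pmd_end;
	if (folio_end_va < end)
		end = folio_end_va;

	return (end - start) / PAGE_SIZE;
}

int main(void)
{
	/*
	 * A 64K VMA at a 2M-aligned address, backed by an order-9 (512-page)
	 * folio starting at the same file offset: all 16 pages of the VMA
	 * can be mapped at once. Prints "16 pages".
	 */
	printf("%lu pages\n", nr_mappable_pages(0x200000, 0x210000, 0x200000,
						0x200000, 512));
	return 0;
}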
Commit 19773df031bc ("mm/fault: try to map the entire file folio in
finish_fault()") argues that faulting one page at a time only suppresses
RSS accounting; it does not prevent RSS inflation, since the large folio
has already been allocated. Combined with the improvements below, it makes
sense to map as many pages as possible.
We test the patch with the following userspace program. A shmem VMA of
2M is created and faulted in, with the sysfs setting
hugepages-2048kB/shmem_enabled = always, so that the pagecache is populated
with a 2M folio. Then a 64K VMA is created over the same file, and we fault
on each of its pages. We then do MADV_DONTNEED to zap the pagetable, so
that we can fault again in the next iteration. We measure the accumulated
time taken to fault the VMA.
On arm64,
without patch:
Total time taken by inner loop: 4701721766 ns
with patch:
Total time taken by inner loop: 516043507 ns
giving a 9x improvement.
To remove arm64 contpte interference (contpte can only worsen the
execution time here: the test does not access the mapped memory beyond the
faulting write itself, so it incurs the overhead of painting the PTEs with
the cont bit without reaping any benefit), we can change the program to
map a 32K VMA and do the fault 8 times. For this case:
without patch:
Total time taken by inner loop: 2081356415 ns
with patch:
Total time taken by inner loop: 408755218 ns
leading to a ~5x improvement as well.
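The only change needed in the program below for this variant is the VMA
size (a sketch of the assumed delta; with the inner loop bounded by
MTHPSIZE / PAGESIZE, it then performs 8 faults per iteration):

#define MTHPSIZE (1UL << 15) /* 32K VMA instead of 64K */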
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <assert.h>
#include <time.h>
#include <errno.h>
#include <string.h>
#include <sys/syscall.h>
#include <linux/memfd.h>
#include <fcntl.h>
#define PMDSIZE (1UL << 21)
#define PAGESIZE (1UL << 12)
#define MTHPSIZE (1UL << 16)
#define ITERATIONS 1000000
static int xmemfd_create(const char *name, unsigned int flags)
{
#ifdef SYS_memfd_create
	return syscall(SYS_memfd_create, name, flags);
#else
	(void)name; (void)flags;
	errno = ENOSYS;
	return -1;
#endif
}
static void die(const char *msg)
{
	fprintf(stderr, "%s: %s (errno=%d)\n", msg, strerror(errno), errno);
	exit(1);
}
int main(void)
{
	/* Create a shmem-backed "file" (anonymous, RAM/swap-backed) */
	int fd = xmemfd_create("willitscale-shmem", MFD_CLOEXEC);

	if (fd < 0)
		die("memfd_create");
	if (ftruncate(fd, PMDSIZE) != 0)
		die("ftruncate");

	char *c = mmap(NULL, PMDSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (c == MAP_FAILED)
		die("mmap PMDSIZE");
	assert(!((unsigned long)c & (PMDSIZE - 1)));

	/* allocate PMD shmem folio */
	c[0] = 0;

	char *ptr = mmap(NULL, MTHPSIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ptr == MAP_FAILED)
		die("mmap MTHPSIZE");
	assert(!((unsigned long)ptr & (MTHPSIZE - 1)));

	long total_time_ns = 0;

	for (int i = 0; i < ITERATIONS; ++i) {
		struct timespec start, end;

		if (clock_gettime(CLOCK_MONOTONIC, &start) != 0)
			die("clock_gettime start");
		/* Fault on each page of the small VMA */
		for (unsigned long j = 0; j < MTHPSIZE / PAGESIZE; ++j)
			ptr[j * PAGESIZE] = 3;
		if (clock_gettime(CLOCK_MONOTONIC, &end) != 0)
			die("clock_gettime end");

		long elapsed_ns =
			(end.tv_sec - start.tv_sec) * 1000000000L +
			(end.tv_nsec - start.tv_nsec);
		total_time_ns += elapsed_ns;

		/* Zap the pagetable so that the next iteration faults again */
		if (madvise(ptr, MTHPSIZE, MADV_DONTNEED) != 0)
			die("madvise");
	}

	printf("Total time taken by inner loop: %ld ns\n", total_time_ns);

	munmap(ptr, MTHPSIZE);
	munmap(c, PMDSIZE);
	close(fd);
	return 0;
}
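For reference, the benchmark was driven with the shmem mTHP control
enabled beforehand; a typical invocation (file name and build flags are
illustrative, the sysfs path follows the transparent_hugepage ABI) is:

echo always > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/shmem_enabled
gcc -O2 -o fault-bench fault-bench.c
./fault-bench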
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
Based on mm-unstable (6873c4e2723d). mm-selftests pass.
mm/memory.c | 72 ++++++++++++++++++++++++++++-------------------------
1 file changed, 38 insertions(+), 34 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 79ba525671c7..b3d951573076 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5563,11 +5563,14 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	bool is_cow = (vmf->flags & FAULT_FLAG_WRITE) &&
 		      !(vma->vm_flags & VM_SHARED);
 	int type, nr_pages;
-	unsigned long addr;
-	bool needs_fallback = false;
+	unsigned long start_addr;
+	bool single_page_fallback = false;
+	bool try_pmd_mapping = true;
+	pgoff_t file_end;
+	struct address_space *mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
 
 fallback:
-	addr = vmf->address;
+	start_addr = vmf->address;
 
 	/* Did we COW the page? */
 	if (is_cow)
@@ -5586,25 +5589,22 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 			return ret;
 	}
 
-	if (!needs_fallback && vma->vm_file) {
-		struct address_space *mapping = vma->vm_file->f_mapping;
-		pgoff_t file_end;
-
+	if (!single_page_fallback && mapping) {
 		file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
 
 		/*
-		 * Do not allow to map with PTEs beyond i_size and with PMD
-		 * across i_size to preserve SIGBUS semantics.
+		 * Do not allow to map with PMD across i_size to preserve
+		 * SIGBUS semantics.
 		 *
 		 * Make an exception for shmem/tmpfs that for long time
 		 * intentionally mapped with PMDs across i_size.
 		 */
-		needs_fallback = !shmem_mapping(mapping) &&
-			file_end < folio_next_index(folio);
+		try_pmd_mapping = shmem_mapping(mapping) ||
+			file_end >= folio_next_index(folio);
 	}
 
 	if (pmd_none(*vmf->pmd)) {
-		if (!needs_fallback && folio_test_pmd_mappable(folio)) {
+		if (try_pmd_mapping && folio_test_pmd_mappable(folio)) {
 			ret = do_set_pmd(vmf, folio, page);
 			if (ret != VM_FAULT_FALLBACK)
 				return ret;
@@ -5619,49 +5619,53 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 	nr_pages = folio_nr_pages(folio);
 
 	/* Using per-page fault to maintain the uffd semantics */
-	if (unlikely(userfaultfd_armed(vma)) || unlikely(needs_fallback)) {
+	if (unlikely(userfaultfd_armed(vma)) || unlikely(single_page_fallback)) {
 		nr_pages = 1;
 	} else if (nr_pages > 1) {
-		pgoff_t idx = folio_page_idx(folio, page);
-		/* The page offset of vmf->address within the VMA. */
-		pgoff_t vma_off = vmf->pgoff - vmf->vma->vm_pgoff;
-		/* The index of the entry in the pagetable for fault page. */
-		pgoff_t pte_off = pte_index(vmf->address);
+
+		/* Ensure mapping stays within VMA and PMD boundaries */
+		unsigned long pmd_boundary_start = ALIGN_DOWN(vmf->address, PMD_SIZE);
+		unsigned long pmd_boundary_end = pmd_boundary_start + PMD_SIZE;
+		unsigned long va_of_folio_start = vmf->address - ((vmf->pgoff - folio->index) * PAGE_SIZE);
+		unsigned long va_of_folio_end = va_of_folio_start + nr_pages * PAGE_SIZE;
+		unsigned long end_addr;
+
+		start_addr = max3(vma->vm_start, pmd_boundary_start, va_of_folio_start);
+		end_addr = min3(vma->vm_end, pmd_boundary_end, va_of_folio_end);
 
 		/*
-		 * Fallback to per-page fault in case the folio size in page
-		 * cache beyond the VMA limits and PMD pagetable limits.
+		 * Do not allow to map with PTEs across i_size to preserve
+		 * SIGBUS semantics.
+		 *
+		 * Make an exception for shmem/tmpfs that for long time
+		 * intentionally mapped with PMDs across i_size.
 		 */
-		if (unlikely(vma_off < idx ||
-			     vma_off + (nr_pages - idx) > vma_pages(vma) ||
-			     pte_off < idx ||
-			     pte_off + (nr_pages - idx) > PTRS_PER_PTE)) {
-			nr_pages = 1;
-		} else {
-			/* Now we can set mappings for the whole large folio. */
-			addr = vmf->address - idx * PAGE_SIZE;
-			page = &folio->page;
-		}
+		if (mapping && !shmem_mapping(mapping))
+			end_addr = min(end_addr, va_of_folio_start + (file_end - folio->index) * PAGE_SIZE);
+
+		nr_pages = (end_addr - start_addr) >> PAGE_SHIFT;
+		page = folio_page(folio, (start_addr - va_of_folio_start) >> PAGE_SHIFT);
 	}
 
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
-				       addr, &vmf->ptl);
+				       start_addr, &vmf->ptl);
 	if (!vmf->pte)
 		return VM_FAULT_NOPAGE;
 
 	/* Re-check under ptl */
 	if (nr_pages == 1 && unlikely(vmf_pte_changed(vmf))) {
-		update_mmu_tlb(vma, addr, vmf->pte);
+		update_mmu_tlb(vma, start_addr, vmf->pte);
 		ret = VM_FAULT_NOPAGE;
 		goto unlock;
 	} else if (nr_pages > 1 && !pte_range_none(vmf->pte, nr_pages)) {
-		needs_fallback = true;
+		single_page_fallback = true;
+		try_pmd_mapping = false;
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 		goto fallback;
 	}
 
 	folio_ref_add(folio, nr_pages - 1);
-	set_pte_range(vmf, folio, page, nr_pages, addr);
+	set_pte_range(vmf, folio, page, nr_pages, start_addr);
 	type = is_cow ? MM_ANONPAGES : mm_counter_file(folio);
 	add_mm_counter(vma->vm_mm, type, nr_pages);
 	ret = 0;
--
2.34.1
syzbot ci has tested the following series

[v1] mm: map maximum pages possible in finish_fault
https://lore.kernel.org/all/20260206135648.38164-1-dev.jain@arm.com
* [PATCH] mm: map maximum pages possible in finish_fault

and found the following issue:
WARNING in folio_add_file_rmap_ptes

Full report is available here:
https://ci.syzbot.org/series/72be5e3b-2ee7-4758-9574-5c09e110d6d0

***

WARNING in folio_add_file_rmap_ptes

tree:      mm-new
URL:       https://kernel.googlesource.com/pub/scm/linux/kernel/git/akpm/mm.git
base:      7235a985b8126d891dbd4e7f20546ff3002b6363
arch:      amd64
compiler:  Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8
config:    https://ci.syzbot.org/builds/7bf34a11-2b89-440d-9047-d9a8b7d52cbc/config
C repro:   https://ci.syzbot.org/findings/b307b537-ffd9-403f-b188-852c8f39be62/c_repro
syz repro: https://ci.syzbot.org/findings/b307b537-ffd9-403f-b188-852c8f39be62/syz_repro

------------[ cut here ]------------
nr_pages <= 0
WARNING: ./include/linux/rmap.h:349 at __folio_rmap_sanity_checks include/linux/rmap.h:349 [inline], CPU#0: syz.0.17/5977
WARNING: ./include/linux/rmap.h:349 at __folio_add_rmap mm/rmap.c:1350 [inline], CPU#0: syz.0.17/5977
WARNING: ./include/linux/rmap.h:349 at __folio_add_file_rmap mm/rmap.c:1696 [inline], CPU#0: syz.0.17/5977
WARNING: ./include/linux/rmap.h:349 at folio_add_file_rmap_ptes+0x98a/0xe60 mm/rmap.c:1722, CPU#0: syz.0.17/5977
Modules linked in:
CPU: 0 UID: 0 PID: 5977 Comm: syz.0.17 Not tainted syzkaller #0 PREEMPT(full)
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:__folio_rmap_sanity_checks include/linux/rmap.h:349 [inline]
RIP: 0010:__folio_add_rmap mm/rmap.c:1350 [inline]
RIP: 0010:__folio_add_file_rmap mm/rmap.c:1696 [inline]
RIP: 0010:folio_add_file_rmap_ptes+0x98a/0xe60 mm/rmap.c:1722
Code: 0b 90 e9 4c f7 ff ff e8 84 d1 ab ff 48 89 df 48 c7 c6 e0 44 9a 8b e8 d5 3e 11 ff 90 0f 0b 90 e9 9d f7 ff ff e8 67 d1 ab ff 90 <0f> 0b 90 e9 b2 f7 ff ff e8 59 d1 ab ff 49 ff cf e9 ea f7 ff ff e8
RSP: 0018:ffffc90003f675f0 EFLAGS: 00010293
RAX: ffffffff8216c889 RBX: ffffea000412fa00 RCX: ffff8881170c3a80
RDX: 0000000000000000 RSI: 00000000fb412000 RDI: 0000000000000001
RBP: 00000000fb412000 R08: f888304bee000000 R09: 1ffffd4000825f46
R10: dffffc0000000000 R11: fffff94000825f47 R12: dffffc0000000000
R13: ffffea000412fa30 R14: ffffea000412fa00 R15: 0000000000104be8
FS:  0000555589649500(0000) GS:ffff88818e324000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000200000000040 CR3: 0000000114846000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 set_pte_range+0x538/0x8a0 mm/memory.c:5526
 finish_fault+0xec7/0x1290 mm/memory.c:5668
 do_read_fault mm/memory.c:5808 [inline]
 do_fault mm/memory.c:5938 [inline]
 do_pte_missing+0x216f/0x3210 mm/memory.c:4479
 handle_pte_fault mm/memory.c:6322 [inline]
 __handle_mm_fault mm/memory.c:6460 [inline]
 handle_mm_fault+0x1b8c/0x32b0 mm/memory.c:6629
 do_user_addr_fault+0x75b/0x1340 arch/x86/mm/fault.c:1387
 handle_page_fault arch/x86/mm/fault.c:1476 [inline]
 exc_page_fault+0x6a/0xc0 arch/x86/mm/fault.c:1532
 asm_exc_page_fault+0x26/0x30 arch/x86/include/asm/idtentry.h:618
RIP: 0010:rep_movs_alternative+0x51/0x90 arch/x86/lib/copy_user_64.S:81
Code: 84 00 00 00 00 00 0f 1f 00 48 8b 06 48 89 07 48 83 c6 08 48 83 c7 08 83 e9 08 74 db 83 f9 08 73 e8 eb c5 eb 05 c3 cc cc cc cc <48> 8b 06 48 89 07 48 8d 47 08 48 83 e0 f8 48 29 f8 48 01 c7 48 01
RSP: 0018:ffffc90003f67cf8 EFLAGS: 00010202
RAX: 00007ffffffff001 RBX: 0000000000000050 RCX: 0000000000000050
RDX: 0000000000000001 RSI: 0000200000000040 RDI: ffffc90003f67d60
RBP: ffffc90003f67eb8 R08: ffffc90003f67daf R09: 1ffff920007ecfb5
R10: dffffc0000000000 R11: fffff520007ecfb6 R12: dffffc0000000000
R13: 0000000000000050 R14: ffffc90003f67d60 R15: 0000200000000040
 copy_user_generic arch/x86/include/asm/uaccess_64.h:126 [inline]
 raw_copy_from_user arch/x86/include/asm/uaccess_64.h:141 [inline]
 _inline_copy_from_user include/linux/uaccess.h:185 [inline]
 _copy_from_user+0x7a/0xb0 lib/usercopy.c:18
 copy_from_user include/linux/uaccess.h:223 [inline]
 copy_from_bpfptr_offset include/linux/bpfptr.h:53 [inline]
 copy_from_bpfptr include/linux/bpfptr.h:59 [inline]
 __sys_bpf+0x229/0x920 kernel/bpf/syscall.c:6137
 __do_sys_bpf kernel/bpf/syscall.c:6274 [inline]
 __se_sys_bpf kernel/bpf/syscall.c:6272 [inline]
 __x64_sys_bpf+0x7c/0x90 kernel/bpf/syscall.c:6272
 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
 do_syscall_64+0xe2/0xf80 arch/x86/entry/syscall_64.c:94
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f4fb5d9acb9
Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007ffe24ca9848 EFLAGS: 00000246 ORIG_RAX: 0000000000000141
RAX: ffffffffffffffda RBX: 00007f4fb6015fa0 RCX: 00007f4fb5d9acb9
RDX: 0000000000000050 RSI: 0000200000000040 RDI: 0000000000000000
RBP: 00007f4fb5e08bf7 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007f4fb6015fac R14: 00007f4fb6015fa0 R15: 00007f4fb6015fa0
 </TASK>
----------------
Code disassembly (best guess):
   0:	84 00                	test   %al,(%rax)
   2:	00 00                	add    %al,(%rax)
   4:	00 00                	add    %al,(%rax)
   6:	0f 1f 00             	nopl   (%rax)
   9:	48 8b 06             	mov    (%rsi),%rax
   c:	48 89 07             	mov    %rax,(%rdi)
   f:	48 83 c6 08          	add    $0x8,%rsi
  13:	48 83 c7 08          	add    $0x8,%rdi
  17:	83 e9 08             	sub    $0x8,%ecx
  1a:	74 db                	je     0xfffffff7
  1c:	83 f9 08             	cmp    $0x8,%ecx
  1f:	73 e8                	jae    0x9
  21:	eb c5                	jmp    0xffffffe8
  23:	eb 05                	jmp    0x2a
  25:	c3                   	ret
  26:	cc                   	int3
  27:	cc                   	int3
  28:	cc                   	int3
  29:	cc                   	int3
* 2a:	48 8b 06             	mov    (%rsi),%rax		<-- trapping instruction
  2d:	48 89 07             	mov    %rax,(%rdi)
  30:	48 8d 47 08          	lea    0x8(%rdi),%rax
  34:	48 83 e0 f8          	and    $0xfffffffffffffff8,%rax
  38:	48 29 f8             	sub    %rdi,%rax
  3b:	48 01 c7             	add    %rax,%rdi
  3e:	48                   	rex.W
  3f:	01                   	.byte 0x1

***

If these findings have caused you to resend the series or submit a
separate fix, please add the following tag to your commit message:
Tested-by: syzbot@syzkaller.appspotmail.com

---
This report is generated by a bot. It may contain errors.
syzbot ci engineers can be reached at syzkaller@googlegroups.com.
On Fri, Feb 06, 2026 at 07:26:48PM +0530, Dev Jain wrote:
> We test the patch with the following userspace program. A shmem VMA of
> 2M is created and faulted in, with the sysfs setting
> hugepages-2048kB/shmem_enabled = always, so that the pagecache is populated
> with a 2M folio. Then a 64K VMA is created over the same file, and we fault
> on each of its pages. We then do MADV_DONTNEED to zap the pagetable, so
> that we can fault again in the next iteration. We measure the accumulated
> time taken to fault the VMA.
>
> On arm64,
>
> without patch:
> Total time taken by inner loop: 4701721766 ns
>
> with patch:
> Total time taken by inner loop: 516043507 ns
>
> giving a 9x improvement.

It's nice that you can construct a test-case that shows improvement, but
is there any real workload that benefits from this?