arch/alpha/include/asm/page.h | 1 + arch/arc/include/asm/page.h | 1 + arch/arm/include/asm/page.h | 1 + arch/arm64/Kconfig | 26 ++- arch/arm64/include/asm/assembler.h | 78 ++++++- arch/arm64/include/asm/cpufeature.h | 44 +++- arch/arm64/include/asm/efi.h | 2 +- arch/arm64/include/asm/fixmap.h | 28 ++- arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- arch/arm64/include/asm/kvm_arm.h | 21 +- arch/arm64/include/asm/kvm_hyp.h | 11 + arch/arm64/include/asm/kvm_pgtable.h | 6 +- arch/arm64/include/asm/memory.h | 62 ++++-- arch/arm64/include/asm/page-def.h | 3 +- arch/arm64/include/asm/pgalloc.h | 16 +- arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- arch/arm64/include/asm/pgtable-prot.h | 2 +- arch/arm64/include/asm/pgtable.h | 133 +++++++++--- arch/arm64/include/asm/processor.h | 10 +- arch/arm64/include/asm/sections.h | 1 + arch/arm64/include/asm/smp.h | 1 + arch/arm64/include/asm/sparsemem.h | 15 +- arch/arm64/include/asm/sysreg.h | 54 +++-- arch/arm64/include/asm/tlb.h | 3 + arch/arm64/kernel/asm-offsets.c | 4 +- arch/arm64/kernel/cpufeature.c | 93 ++++++-- arch/arm64/kernel/efi.c | 2 +- arch/arm64/kernel/entry.S | 60 +++++- arch/arm64/kernel/head.S | 46 +++- arch/arm64/kernel/hibernate-asm.S | 6 +- arch/arm64/kernel/image-vars.h | 14 ++ arch/arm64/kernel/image.h | 4 + arch/arm64/kernel/pi/idreg-override.c | 68 +++++- arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- arch/arm64/kernel/pi/pi.h | 63 +++++- arch/arm64/kernel/relocate_kernel.S | 10 +- arch/arm64/kernel/vdso-wrap.S | 4 +- arch/arm64/kernel/vdso.c | 7 +- arch/arm64/kernel/vdso/vdso.lds.S | 4 +- arch/arm64/kernel/vdso32-wrap.S | 4 +- arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- arch/arm64/kernel/vmlinux.lds.S | 48 +++-- arch/arm64/kvm/arm.c | 10 + arch/arm64/kvm/hyp/nvhe/Makefile | 1 + arch/arm64/kvm/hyp/nvhe/host.S | 10 +- arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- 
arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ arch/arm64/kvm/mmu.c | 39 ++-- arch/arm64/lib/clear_page.S | 7 +- arch/arm64/lib/copy_page.S | 33 ++- arch/arm64/lib/mte.S | 27 ++- arch/arm64/mm/Makefile | 1 + arch/arm64/mm/fixmap.c | 38 ++-- arch/arm64/mm/hugetlbpage.c | 40 +--- arch/arm64/mm/init.c | 26 +-- arch/arm64/mm/kasan_init.c | 8 +- arch/arm64/mm/mmu.c | 53 +++-- arch/arm64/mm/pgd.c | 12 +- arch/arm64/mm/pgtable-geometry.c | 24 +++ arch/arm64/mm/proc.S | 128 ++++++++--- arch/arm64/mm/ptdump.c | 3 +- arch/arm64/tools/cpucaps | 3 + arch/csky/include/asm/page.h | 3 + arch/hexagon/include/asm/page.h | 2 + arch/loongarch/include/asm/page.h | 2 + arch/m68k/include/asm/page.h | 1 + arch/microblaze/include/asm/page.h | 1 + arch/mips/include/asm/page.h | 1 + arch/nios2/include/asm/page.h | 2 + arch/openrisc/include/asm/page.h | 1 + arch/parisc/include/asm/page.h | 1 + arch/powerpc/include/asm/page.h | 2 + arch/riscv/include/asm/page.h | 1 + arch/s390/include/asm/page.h | 1 + arch/sh/include/asm/page.h | 1 + arch/sparc/include/asm/page.h | 3 + arch/um/include/asm/page.h | 2 + arch/x86/include/asm/page_types.h | 2 + arch/xtensa/include/asm/page.h | 1 + crypto/lskcipher.c | 4 +- drivers/ata/sata_sil24.c | 46 ++-- drivers/base/node.c | 6 +- drivers/base/topology.c | 32 +-- drivers/block/virtio_blk.c | 2 +- drivers/char/random.c | 4 +- drivers/edac/edac_mc.h | 13 +- drivers/firmware/efi/libstub/arm64.c | 3 +- drivers/irqchip/irq-gic-v3-its.c | 2 +- drivers/mtd/mtdswap.c | 4 +- drivers/net/ethernet/freescale/fec.h | 3 +- drivers/net/ethernet/freescale/fec_main.c | 5 +- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- drivers/net/ethernet/intel/igb/igb.h | 25 +-- drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- drivers/net/ethernet/marvell/mvneta.c | 9 +- drivers/net/ethernet/marvell/sky2.h | 2 +- drivers/tee/optee/call.c | 7 +- 
drivers/tee/optee/smc_abi.c | 2 +- drivers/virtio/virtio_balloon.c | 10 +- drivers/xen/balloon.c | 11 +- drivers/xen/biomerge.c | 12 +- drivers/xen/privcmd.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 5 +- drivers/xen/xlate_mmu.c | 6 +- fs/binfmt_elf.c | 11 +- fs/buffer.c | 2 +- fs/coredump.c | 8 +- fs/ext4/ext4.h | 36 ++-- fs/ext4/move_extent.c | 2 +- fs/ext4/readpage.c | 2 +- fs/fat/dir.c | 4 +- fs/fat/fatent.c | 4 +- fs/nfs/nfs42proc.c | 2 +- fs/nfs/nfs42xattr.c | 2 +- fs/nfs/nfs4proc.c | 2 +- include/asm-generic/pgtable-geometry.h | 71 +++++++ include/asm-generic/vmlinux.lds.h | 38 ++-- include/linux/buffer_head.h | 1 + include/linux/cpumask.h | 5 + include/linux/linkage.h | 4 +- include/linux/mm.h | 17 +- include/linux/mm_types.h | 15 +- include/linux/mm_types_task.h | 2 +- include/linux/mmzone.h | 3 +- include/linux/netlink.h | 6 +- include/linux/percpu-defs.h | 4 +- include/linux/perf_event.h | 2 +- include/linux/sched.h | 4 +- include/linux/slab.h | 7 +- include/linux/stackdepot.h | 6 +- include/linux/sunrpc/svc.h | 8 +- include/linux/sunrpc/svc_rdma.h | 4 +- include/linux/sunrpc/svcsock.h | 2 +- include/linux/swap.h | 17 +- include/linux/swapops.h | 6 +- include/linux/thread_info.h | 10 +- include/xen/page.h | 2 + init/main.c | 7 +- kernel/bpf/core.c | 9 +- kernel/bpf/ringbuf.c | 54 ++--- kernel/cgroup/cgroup.c | 8 +- kernel/crash_core.c | 2 +- kernel/events/core.c | 2 +- kernel/fork.c | 71 +++---- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 2 +- kernel/power/swap.c | 129 +++++++++-- kernel/trace/fgraph.c | 2 +- kernel/trace/trace.c | 2 +- lib/stackdepot.c | 6 +- mm/kasan/report.c | 3 +- mm/memcontrol.c | 11 +- mm/memory.c | 4 +- mm/mmap.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 31 +-- mm/slub.c | 2 +- mm/sparse.c | 2 +- mm/swapfile.c | 2 +- mm/vmalloc.c | 7 +- net/9p/trans_virtio.c | 4 +- net/core/hotdata.c | 4 +- net/core/skbuff.c | 4 +- net/core/sysctl_net_core.c | 2 +- net/sunrpc/cache.c | 3 +- net/unix/af_unix.c | 2 +- 
sound/soc/soc-utils.c | 4 +- virt/kvm/kvm_main.c | 2 +- 172 files changed, 2185 insertions(+), 951 deletions(-) create mode 100644 arch/arm64/include/asm/pgtable-geometry.h create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c create mode 100644 arch/arm64/mm/pgtable-geometry.c create mode 100644 include/asm-generic/pgtable-geometry.h
Hi All, Patch bomb incoming... This covers many subsystems, so I've included a core set of people on the full series and additionally included maintainers on relevant patches. I haven't included those maintainers on this cover letter since the numbers were far too big for it to work. But I've included a link to this cover letter on each patch, so they can hopefully find their way here. For follow up submissions I'll break it up by subsystem, but for now I thought it was important to show the full picture. This RFC series implements support for boot-time page size selection within the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page size has been selected at compile-time, meaning the size is baked into a given kernel image. As use of larger-than-4K page sizes becomes more prevalent this starts to present a problem for distributions. Boot-time page size selection enables the creation of a single kernel image, which can be told which page size to use on the kernel command line. Why is having an image-per-page size problematic? ================================================= Many traditional distros are now supporting both 4K and 64K. And this means managing 2 kernel packages, along with drivers for each. For some, it means multiple installer flavours and multiple ISOs. All of this adds up to a less-than-ideal level of complexity. Additionally, Android now supports 4K and 16K kernels. I'm told having to explicitly manage their KABI for each kernel is painful, and the extra flash space required for both kernel images and the duplicated modules has been problematic. Boot-time page size selection solves all of this. Additionally, in starting to think about the longer term deployment story for D128 page tables, which the Arm architecture now supports, a lot of the same problems need to be solved, so this work sets us up nicely for that. So what's the down side? 
======================== Well nothing's free; Various static allocations in the kernel image must be sized for the worst case (largest supported page size), so image size is in line with size of 64K compile-time image. So if you're interested in 4K or 16K, there is a slight increase to the image size. But I expect that problem goes away if you're compressing the image - it's just some extra zeros. At boot-time, I expect we could free the unused static storage once we know the page size - although that would be a follow up enhancement. And then there is performance. Since PAGE_SIZE and friends are no longer compile-time constants, we must look up their values and do arithmetic at runtime instead of compile-time. My early perf testing suggests this is imperceptible for real-world workloads, and only has small impact on microbenchmarks - more on this below. Approach ======== The basic idea is to rid the source of any assumptions that PAGE_SIZE and friends are compile-time constant, but in a way that allows the compiler to perform the same optimizations as was previously being done if they do turn out to be compile-time constant. Where constants are required, we use limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description of all the classes of problems to solve. By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, which is an alternative to selecting a compile-time page size. When boot-time page size is active, the arch pgtable geometry macro definitions resolve to something that can be configured at boot. The arm64 implementation in this series mainly uses global, __ro_after_init variables. I've tried using alternatives patching, but that performs worse than loading from memory; I think due to code size bloat. 
Status ====== When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented enough to compile the kernel image itself with defconfig (and a few other bits and pieces). This is enough to build a kernel that can boot under QEMU or FVP. I'll happily do the rest of the work to enable all the extra drivers, but wanted to get feedback on the shape of this effort first. If anyone wants to do any testing, and has a must-have config, let me know and I'll prioritize enabling it first. The series is arranged as follows: - patch 1: Add macros required for converting non-arch code to support boot-time page size selection - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all non-arch code - patches 37-38: Some arm64 tidy ups - patch 39: Add macros required for converting arm64 code to support boot-time page size selection - patches 40-56: arm64 changes to support boot-time page size selection - patch 57: Add arm64 Kconfig option to enable boot-time page size selection Ideally, I'd like to get the basics merged (something like this series), then incrementally improve it over a handful of kernel releases until we can demonstrate that we have feature parity with the compile-time build and no performance blockers. Once at that point, ideally the compile-time build options would be removed and the code could be cleaned up further. One of the bigger pieces that I'd propose to add as a follow up, is to make va-size boot-time selectable too. That will greatly simplify LPA2 fallback handling. Assuming people are amenable to the rough shape, how would I go about getting the non-arch changes merged? Since they cover many subsystems, will each piece need to go independently to each relevant maintainer or could it all be merged together through the arm64 tree? 
Image Size ========== The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) kernel image on disk for base (before any changes applied), compile (with changes, configured for compile-time page size) and boot (with changes, configured for boot-time page size). You can see that the compile-16k and compile-64k configs are actually slightly smaller than the baselines; that's due to optimizing some buffer sizes which didn't need to depend on page size during the series. The boot-time image is ~1% bigger than the 64k compile-time image. I believe there is scope to improve this to make it equal to compile-64k if required: | config | size/KB | diff/KB | diff/% | |-------------|---------|---------|---------| | base-4k | 54895 | 0 | 0.0% | | base-16k | 55161 | 266 | 0.5% | | base-64k | 56775 | 1880 | 3.4% | | compile-4k | 54895 | 0 | 0.0% | | compile-16k | 55097 | 202 | 0.4% | | compile-64k | 56391 | 1496 | 2.7% | | boot-4K | 57045 | 2150 | 3.9% | And below shows the size of the image in memory at run-time, separated for text and data costs. The boot image has ~1% text cost; most likely due to the fact that PAGE_SIZE and friends are not compile-time constants so need instructions to load the values and do arithmetic. 
I believe we could eventually get the data cost to match the cost for the compile image for the chosen page size by freeing the ends of the static buffers not needed for the selected page size: | | text | text | text | data | data | data | | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | |-------------|---------|---------|---------|---------|---------|---------| | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | Functional Testing ================== I've build-tested defconfig for all arches supported by tuxmake (which is most) without issue. I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page sizes and a few va-sizes, and additionally have run all the mm-selftests, with no regressions observed vs the equivalent compile-time page size build (although the mm-selftests have a few existing failures when run against 16K and 64K kernels - those should really be investigated and fixed independently). Test coverage is lacking for many of the drivers that I've touched, but in many cases, I'm hoping the changes are simple enough that review might suffice? Performance Testing =================== I've run some limited performance benchmarks: First, a real-world benchmark that causes a lot of page table manipulation (and therefore we would expect to see regression here if we are going to see it anywhere); kernel compilation. It barely registers a change. Values are times, so smaller is better. 
All relative to base-4k: | | kern | kern | user | user | real | real | | config | mean | stdev | mean | stdev | mean | stdev | |-------------|---------|---------|---------|---------|---------|---------| | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | The Speedometer JavaScript benchmark also shows no change. Values are runs per min, so bigger is better. All relative to base-4k: | config | mean | stdev | |-------------|---------|---------| | base-4k | 0.0% | 0.8% | | compile-4k | 0.4% | 0.8% | | boot-4k | 0.0% | 0.9% | Finally, I've run some microbenchmarks known to stress page table manipulations (originally from David Hildenbrand). The fork test maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon memory then measures the cost of munmap()ing it. The fork test is known to be extremely sensitive to any changes that cause instructions to be aligned differently in cachelines. When using this test for other changes, I've seen double digit regressions for the slightest thing, so a 12% regression on this test is actually fairly good. This likely represents the extreme worst case for regressions that will be observed across other microbenchmarks (famous last words). Values are times, so smaller is better. All relative to base-4k: | | fork | fork | munmap | munmap | | config | mean | stdev | mean | stdev | |-------------|---------|---------|---------|---------| | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | NOTE: The series applies on top of v6.11. 
Thanks, Ryan Ryan Roberts (57): mm: Add macros ahead of supporting boot-time page size selection vmlinux: Align to PAGE_SIZE_MAX mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large mm/page_alloc: Make page_frag_cache boot-time page size compatible mm: Avoid split pmd ptl if pmd level is run-time folded mm: Remove PAGE_SIZE compile-time constant assumption fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing fs: Remove PAGE_SIZE compile-time constant assumption fs/nfs: Remove PAGE_SIZE compile-time constant assumption fs/ext4: Remove PAGE_SIZE compile-time constant assumption fork: Permit boot-time THREAD_SIZE determination cgroup: Remove PAGE_SIZE compile-time constant assumption bpf: Remove PAGE_SIZE compile-time constant assumption pm/hibernate: Remove PAGE_SIZE compile-time constant assumption stackdepot: Remove PAGE_SIZE compile-time constant assumption perf: Remove PAGE_SIZE compile-time constant assumption kvm: Remove PAGE_SIZE compile-time constant assumption trace: Remove PAGE_SIZE compile-time constant assumption crash: Remove PAGE_SIZE compile-time constant assumption crypto: Remove PAGE_SIZE compile-time constant assumption sunrpc: Remove PAGE_SIZE compile-time constant assumption sound: Remove PAGE_SIZE compile-time constant assumption net: Remove PAGE_SIZE compile-time constant assumption net: fec: Remove PAGE_SIZE compile-time constant assumption net: marvell: Remove PAGE_SIZE compile-time constant assumption net: hns3: Remove PAGE_SIZE compile-time constant assumption net: e1000: Remove PAGE_SIZE compile-time constant assumption net: igbvf: Remove PAGE_SIZE compile-time constant assumption net: igb: Remove PAGE_SIZE compile-time constant assumption drivers/base: Remove PAGE_SIZE compile-time constant assumption edac: Remove PAGE_SIZE compile-time constant assumption optee: Remove PAGE_SIZE compile-time constant assumption random: Remove PAGE_SIZE compile-time constant assumption sata_sil24: Remove PAGE_SIZE compile-time 
constant assumption virtio: Remove PAGE_SIZE compile-time constant assumption xen: Remove PAGE_SIZE compile-time constant assumption arm64: Fix macros to work in C code in addition to the linker script arm64: Track early pgtable allocation limit arm64: Introduce macros required for boot-time page selection arm64: Refactor early pgtable size calculation macros arm64: Pass desired page size on command line arm64: Divorce early init from PAGE_SIZE arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES arm64: Align sections to PAGE_SIZE_MAX arm64: Rework trampoline rodata mapping arm64: Generalize fixmap for boot-time page size arm64: Statically allocate and align for worst-case page size arm64: Convert switch to if for non-const comparison values arm64: Convert BUILD_BUG_ON to VM_BUG_ON arm64: Remove PAGE_SZ asm-offset arm64: Introduce cpu features for page sizes arm64: Remove PAGE_SIZE from assembly code arm64: Runtime-fold pmd level arm64: Support runtime folding in idmap_kpti_install_ng_mappings arm64: TRAMP_VALIAS is no longer compile-time constant arm64: Determine THREAD_SIZE at boot-time arm64: Enable boot-time page size selection arch/alpha/include/asm/page.h | 1 + arch/arc/include/asm/page.h | 1 + arch/arm/include/asm/page.h | 1 + arch/arm64/Kconfig | 26 ++- arch/arm64/include/asm/assembler.h | 78 ++++++- arch/arm64/include/asm/cpufeature.h | 44 +++- arch/arm64/include/asm/efi.h | 2 +- arch/arm64/include/asm/fixmap.h | 28 ++- arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- arch/arm64/include/asm/kvm_arm.h | 21 +- arch/arm64/include/asm/kvm_hyp.h | 11 + arch/arm64/include/asm/kvm_pgtable.h | 6 +- arch/arm64/include/asm/memory.h | 62 ++++-- arch/arm64/include/asm/page-def.h | 3 +- arch/arm64/include/asm/pgalloc.h | 16 +- arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- arch/arm64/include/asm/pgtable-prot.h | 2 +- arch/arm64/include/asm/pgtable.h | 133 +++++++++--- arch/arm64/include/asm/processor.h | 
10 +- arch/arm64/include/asm/sections.h | 1 + arch/arm64/include/asm/smp.h | 1 + arch/arm64/include/asm/sparsemem.h | 15 +- arch/arm64/include/asm/sysreg.h | 54 +++-- arch/arm64/include/asm/tlb.h | 3 + arch/arm64/kernel/asm-offsets.c | 4 +- arch/arm64/kernel/cpufeature.c | 93 ++++++-- arch/arm64/kernel/efi.c | 2 +- arch/arm64/kernel/entry.S | 60 +++++- arch/arm64/kernel/head.S | 46 +++- arch/arm64/kernel/hibernate-asm.S | 6 +- arch/arm64/kernel/image-vars.h | 14 ++ arch/arm64/kernel/image.h | 4 + arch/arm64/kernel/pi/idreg-override.c | 68 +++++- arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- arch/arm64/kernel/pi/pi.h | 63 +++++- arch/arm64/kernel/relocate_kernel.S | 10 +- arch/arm64/kernel/vdso-wrap.S | 4 +- arch/arm64/kernel/vdso.c | 7 +- arch/arm64/kernel/vdso/vdso.lds.S | 4 +- arch/arm64/kernel/vdso32-wrap.S | 4 +- arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- arch/arm64/kernel/vmlinux.lds.S | 48 +++-- arch/arm64/kvm/arm.c | 10 + arch/arm64/kvm/hyp/nvhe/Makefile | 1 + arch/arm64/kvm/hyp/nvhe/host.S | 10 +- arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ arch/arm64/kvm/mmu.c | 39 ++-- arch/arm64/lib/clear_page.S | 7 +- arch/arm64/lib/copy_page.S | 33 ++- arch/arm64/lib/mte.S | 27 ++- arch/arm64/mm/Makefile | 1 + arch/arm64/mm/fixmap.c | 38 ++-- arch/arm64/mm/hugetlbpage.c | 40 +--- arch/arm64/mm/init.c | 26 +-- arch/arm64/mm/kasan_init.c | 8 +- arch/arm64/mm/mmu.c | 53 +++-- arch/arm64/mm/pgd.c | 12 +- arch/arm64/mm/pgtable-geometry.c | 24 +++ arch/arm64/mm/proc.S | 128 ++++++++--- arch/arm64/mm/ptdump.c | 3 +- arch/arm64/tools/cpucaps | 3 + arch/csky/include/asm/page.h | 3 + arch/hexagon/include/asm/page.h | 2 + arch/loongarch/include/asm/page.h | 2 + arch/m68k/include/asm/page.h | 1 + arch/microblaze/include/asm/page.h | 1 + arch/mips/include/asm/page.h | 1 + arch/nios2/include/asm/page.h | 2 + arch/openrisc/include/asm/page.h | 1 + 
arch/parisc/include/asm/page.h | 1 + arch/powerpc/include/asm/page.h | 2 + arch/riscv/include/asm/page.h | 1 + arch/s390/include/asm/page.h | 1 + arch/sh/include/asm/page.h | 1 + arch/sparc/include/asm/page.h | 3 + arch/um/include/asm/page.h | 2 + arch/x86/include/asm/page_types.h | 2 + arch/xtensa/include/asm/page.h | 1 + crypto/lskcipher.c | 4 +- drivers/ata/sata_sil24.c | 46 ++-- drivers/base/node.c | 6 +- drivers/base/topology.c | 32 +-- drivers/block/virtio_blk.c | 2 +- drivers/char/random.c | 4 +- drivers/edac/edac_mc.h | 13 +- drivers/firmware/efi/libstub/arm64.c | 3 +- drivers/irqchip/irq-gic-v3-its.c | 2 +- drivers/mtd/mtdswap.c | 4 +- drivers/net/ethernet/freescale/fec.h | 3 +- drivers/net/ethernet/freescale/fec_main.c | 5 +- .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- drivers/net/ethernet/intel/igb/igb.h | 25 +-- drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- drivers/net/ethernet/marvell/mvneta.c | 9 +- drivers/net/ethernet/marvell/sky2.h | 2 +- drivers/tee/optee/call.c | 7 +- drivers/tee/optee/smc_abi.c | 2 +- drivers/virtio/virtio_balloon.c | 10 +- drivers/xen/balloon.c | 11 +- drivers/xen/biomerge.c | 12 +- drivers/xen/privcmd.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 5 +- drivers/xen/xlate_mmu.c | 6 +- fs/binfmt_elf.c | 11 +- fs/buffer.c | 2 +- fs/coredump.c | 8 +- fs/ext4/ext4.h | 36 ++-- fs/ext4/move_extent.c | 2 +- fs/ext4/readpage.c | 2 +- fs/fat/dir.c | 4 +- fs/fat/fatent.c | 4 +- fs/nfs/nfs42proc.c | 2 +- fs/nfs/nfs42xattr.c | 2 +- fs/nfs/nfs4proc.c | 2 +- include/asm-generic/pgtable-geometry.h | 71 +++++++ include/asm-generic/vmlinux.lds.h | 38 ++-- include/linux/buffer_head.h | 1 + include/linux/cpumask.h | 5 + include/linux/linkage.h | 4 +- include/linux/mm.h | 17 +- include/linux/mm_types.h | 15 +- include/linux/mm_types_task.h | 2 +- include/linux/mmzone.h | 3 +- include/linux/netlink.h | 6 +- 
include/linux/percpu-defs.h | 4 +- include/linux/perf_event.h | 2 +- include/linux/sched.h | 4 +- include/linux/slab.h | 7 +- include/linux/stackdepot.h | 6 +- include/linux/sunrpc/svc.h | 8 +- include/linux/sunrpc/svc_rdma.h | 4 +- include/linux/sunrpc/svcsock.h | 2 +- include/linux/swap.h | 17 +- include/linux/swapops.h | 6 +- include/linux/thread_info.h | 10 +- include/xen/page.h | 2 + init/main.c | 7 +- kernel/bpf/core.c | 9 +- kernel/bpf/ringbuf.c | 54 ++--- kernel/cgroup/cgroup.c | 8 +- kernel/crash_core.c | 2 +- kernel/events/core.c | 2 +- kernel/fork.c | 71 +++---- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 2 +- kernel/power/swap.c | 129 +++++++++-- kernel/trace/fgraph.c | 2 +- kernel/trace/trace.c | 2 +- lib/stackdepot.c | 6 +- mm/kasan/report.c | 3 +- mm/memcontrol.c | 11 +- mm/memory.c | 4 +- mm/mmap.c | 2 +- mm/page-writeback.c | 2 +- mm/page_alloc.c | 31 +-- mm/slub.c | 2 +- mm/sparse.c | 2 +- mm/swapfile.c | 2 +- mm/vmalloc.c | 7 +- net/9p/trans_virtio.c | 4 +- net/core/hotdata.c | 4 +- net/core/skbuff.c | 4 +- net/core/sysctl_net_core.c | 2 +- net/sunrpc/cache.c | 3 +- net/unix/af_unix.c | 2 +- sound/soc/soc-utils.c | 4 +- virt/kvm/kvm_main.c | 2 +- 172 files changed, 2185 insertions(+), 951 deletions(-) create mode 100644 arch/arm64/include/asm/pgtable-geometry.h create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c create mode 100644 arch/arm64/mm/pgtable-geometry.c create mode 100644 include/asm-generic/pgtable-geometry.h -- 2.43.0
Hi Ryan, On Mon, Oct 14, 2024 at 11:55:11AM +0100, Ryan Roberts wrote: > This RFC series implements support for boot-time page size selection within the > arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page > size has been selected at compile-time, meaning the size is baked into a given > kernel image. As use of larger-than-4K page sizes become more prevalent this > starts to present a problem for distributions. Boot-time page size selection > enables the creation of a single kernel image, which can be told which page size > to use on the kernel command line. That's great work, something I wasn't expecting to even build, let alone run ;). I only looked briefly through the patches, there's probably room for optimisation of micro-benchmarks like fork(), maybe using something like runtime constants. The advantage for deployment and easy testing of different configurations is pretty clear (distros mainly, not sure how relevant it is for Android if apps can't move beyond 4K pages). However, as a maintainer, my main concern is having to chase build failures in obscure drivers that have not been tested/developed on arm64. If people primarily test on x86, they wouldn't notice that PAGE_SIZE/PAGE_SHIFT are no longer constants. Not looking forward to trying to sort out allmodconfig builds every kernel release, especially if they turn up in subsystems I have no clue about (like most stuff outside arch/arm64). So, first of all, I'd like to understand the overall maintainability impact better. I assume you tested mostly defconfig. If you run an allmodconfig build with make -k, how many build failures do you get with this patchset? Similarly for some distro configs. Do we have any better way to detect this other than actual compilation on arm64? 
Can we hack something around COMPILE_TEST like redefine PAGE_SIZE (for modules only) to a variable so that we have a better chance of detecting build failures when modules are only tested on other architectures? At the moment, I'm not entirely convinced of the benefits vs. long term maintainability. Even if we don't end up merging the dynamic PAGE_SIZE support, parts of this series are needed for supporting 128-bit ptes on arm64, hopefully dynamically as well. Thanks. -- Catalin
On 31/10/2024 21:07, Catalin Marinas wrote: > Hi Ryan, > > On Mon, Oct 14, 2024 at 11:55:11AM +0100, Ryan Roberts wrote: >> This RFC series implements support for boot-time page size selection within the >> arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page >> size has been selected at compile-time, meaning the size is baked into a given >> kernel image. As use of larger-than-4K page sizes become more prevalent this >> starts to present a problem for distributions. Boot-time page size selection >> enables the creation of a single kernel image, which can be told which page size >> to use on the kernel command line. > > That's great work, something I wasn't expecting to even build, let alone > run ;). Cheers! > I only looked briefly through the patches, there's probably room > for optimisation of micro-benchmarks like fork(), maybe using something > like runtime constants. Yes I suspect there is room for some optimization. Although note I already tried using alternatives patching but for the fork() microbenchmark this performed worse than the approach I ended up taking of just loading a global variable. I think this was likely due to code layout changes due to all the extra branches/nops - fork has been very sensitive to code layout changes in the past. > The advantage for deployment and easy testing of > different configurations is pretty clear (distros mainly, not sure how > relevant it is for Android if apps can't move beyond 4K pages). > > However, as a maintainer, my main concern is having to chase build > failures in obscure drivers that have not been tested/developed on > arm64. If people primarily test on x86, they wouldn't notice that > PAGE_SIZE/PAGE_SHIFT are no longer constants. Not looking forward to > trying to sort out allmodconfig builds every kernel release, especially > if they turn up in subsystems I have no clue about (like most stuff > outside arch/arm64). Yes, I understand that concern. 
> > So, first of all, I'd like to understand the overall maintainability > impact better. I assume you tested mostly defconfig. If you run an > allmodconfig build with make -k, how many build failures do you get with > this patchset? Similarly for some distro configs. I've roughly done: make alldefconfig && ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE && make -s -j`nproc` -k &> allmodconfig.log Then parsed the log for issues. Unfortunately the errors are very chatty and it is difficult to perfectly extract stats. If I search for r'(\S+\.[ch]):.*error:', that is optimistic because PAGE_SIZE being non-const gets the ultimate blame for most things, but I'm interested in the call sites. Number of affected files using this approach: 111. If I just blindly search for all files, r'(\S+\.[ch]):', that is pessimistic because when the issue is in a header, the full include chain is spat out. Number of affected files using this approach: 1807. If I just search for C files, r'(\S+\.[c]):', (all issues in headers terminate in a C file) that is also pessimistic because the same single header issue is reported for every C file it is included in. Number of affected files using this approach: 1369. In the end, I decided to go for r'(\S+\.[ch]):.*(error|note):', which is any files described as having an error or being the callsite of the thing with the error. I think this is likely most accurate from eyeballing the log: | | C&H files | percentage of | | directory | w/ error | all C&H files | |------------|---------------|---------------| | arch/arm64 | 7 | 1.3% | | drivers | 127 | 0.4% | | fs | 25 | 1.1% | | include | 27 | 0.4% | | init | 1 | 8.3% | | kernel | 7 | 1.3% | | lib | 1 | 0.2% | | mm | 6 | 3.2% | | net | 7 | 0.4% | | security | 2 | 0.8% | | sound | 21 | 0.8% | |------------|---------------|---------------| | TOTAL | 231 | 0.4% | |------------|---------------|---------------| I'm not sure how best to evaluate if this is a large or small number though! 
For comparison, the RFC modified 172 files. > > Do we have any better way to detect this other than actual compilation > on arm64? Can we hack something around COMPILE_TEST like redefine > PAGE_SIZE (for modules only) to a variable so that we have a better > chance of detecting build failures when modules are only tested on other > architectures? I can certainly look into this. But if the concern is that drivers are not being compiled against arm64, what is the likelihood of them being compiled against COMPILE_TEST? > > At the moment, I'm not entirely convinced of the benefits vs. long term > maintainability. Even if we don't end up merging the dynamic PAGE_SIZE > support, parts of this series are needed for supporting 128-bit ptes on > arm64, hopefully dynamically as well. Agreed. Thanks, Ryan > > Thanks. >
On Wed, Nov 06, 2024 at 11:37:58AM +0000, Ryan Roberts wrote: > On 31/10/2024 21:07, Catalin Marinas wrote: > > So, first of all, I'd like to understand the overall maintainability > > impact better. I assume you tested mostly defconfig. If you run an > > allmodconfig build with make -k, how many build failures do you get with > > this patchset? Similarly for some distro configs. > > I've roughly done: > > make alldefconfig && > ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE && > make -s -j`nproc` -k &> allmodconfig.log Is it alldefconfig or allmodconfig? The former has a lot less symbols enabled than even defconfig (fairly close to allnoconfig actually): $ make defconfig $ grep -v "^#\|^$" .config | wc -l 4449 $ make alldefconfig $ grep -v "^#\|^$" .config | wc -l 713 $ make allmodconfig $ grep -v "^#\|^$" .config | wc -l 14401 > In the end, I decided to go for r'(\S+\.[ch]):.*(error|note):', which is any > files described as having an error or being the callsite of the thing with the > error. I think this is likely most accurate from eyeballing the log: I think that's good enough to give us a rough idea. > | | C&H files | percentage of | > | directory | w/ error | all C&H files | > |------------|---------------|---------------| > | arch/arm64 | 7 | 1.3% | > | drivers | 127 | 0.4% | > | fs | 25 | 1.1% | > | include | 27 | 0.4% | > | init | 1 | 8.3% | > | kernel | 7 | 1.3% | > | lib | 1 | 0.2% | > | mm | 6 | 3.2% | > | net | 7 | 0.4% | > | security | 2 | 0.8% | > | sound | 21 | 0.8% | > |------------|---------------|---------------| > | TOTAL | 231 | 0.4% | > |------------|---------------|---------------| This doesn't look that bad _if_ you actually built most modules. But if it was alldefconfig, you likely missed the majority of modules. > > Do we have any better way to detect this other than actual compilation > > on arm64? 
Can we hack something around COMPILE_TEST like redefine > > PAGE_SIZE (for modules only) to a variable so that we have a better > > chance of detecting build failures when modules are only tested on other > > architectures? > > I can certainly look into this. But if the concern is that drivers are not being > compiled against arm64, what is the likelyhood of them being compiled against > COMPILE_TEST? Hopefully some CIs out there catching them. Well, if we are to fix them anyway, we might as well eventually force a non-const PAGE_SIZE generically even if it returns a constant. I'm building allmod now with something like below (and some hacks in arch and core code to use STATIC_PAGE_* as I did not apply your patches). alldefconfig passes with my hacks but, as you can see, the non-const PAGE_SIZE kicks in only if MODULE is defined. So, not an accurate test, just to get a feel of the modules problem. ----------8<--------------------------- diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h index 792e9fe881dc..71a761f86b15 100644 --- a/arch/arm64/include/asm/page-def.h +++ b/arch/arm64/include/asm/page-def.h @@ -12,7 +12,19 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT CONFIG_PAGE_SHIFT -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +#define STATIC_PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +#define STATIC_PAGE_MASK (~(STATIC_PAGE_SIZE-1)) + +#if !defined(MODULE) || defined(__ASSEMBLY__) +#define PAGE_SIZE STATIC_PAGE_SIZE +#else +static inline unsigned long __runtime_page_size(void) +{ + return 1UL << PAGE_SHIFT; +} +#define PAGE_SIZE (__runtime_page_size()) +#endif + #define PAGE_MASK (~(PAGE_SIZE-1)) #endif /* __ASM_PAGE_DEF_H */ ----------8<--------------------------- -- Catalin
On 07/11/2024 12:35, Catalin Marinas wrote: > On Wed, Nov 06, 2024 at 11:37:58AM +0000, Ryan Roberts wrote: >> On 31/10/2024 21:07, Catalin Marinas wrote: >>> So, first of all, I'd like to understand the overall maintainability >>> impact better. I assume you tested mostly defconfig. If you run an >>> allmodconfig build with make -k, how many build failures do you get with >>> this patchset? Similarly for some distro configs. >> >> I've roughly done: >> >> make alldefconfig && >> ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE && >> make -s -j`nproc` -k &> allmodconfig.log > > Is it alldefconfig or allmodconfig? The former has a lot less symbols > enabled than even defconfig (fairly close to allnoconfig actually): Eek, that was a typo when I wrote the email... I built allmodconfig - the big one. > > $ make defconfig > $ grep -v "^#\|^$" .config | wc -l > 4449 > > $ make alldefconfig > $ grep -v "^#\|^$" .config | wc -l > 713 > > $ make allmodconfig > $ grep -v "^#\|^$" .config | wc -l > 14401 > >> In the end, I decided to go for r'(\S+\.[ch]):.*(error|note):', which is any >> files described as having an error or being the callsite of the thing with the >> error. I think this is likely most accurate from eyeballing the log: > > I think that's good enough to give us a rough idea. > >> | | C&H files | percentage of | >> | directory | w/ error | all C&H files | >> |------------|---------------|---------------| >> | arch/arm64 | 7 | 1.3% | >> | drivers | 127 | 0.4% | >> | fs | 25 | 1.1% | >> | include | 27 | 0.4% | >> | init | 1 | 8.3% | >> | kernel | 7 | 1.3% | >> | lib | 1 | 0.2% | >> | mm | 6 | 3.2% | >> | net | 7 | 0.4% | >> | security | 2 | 0.8% | >> | sound | 21 | 0.8% | >> |------------|---------------|---------------| >> | TOTAL | 231 | 0.4% | >> |------------|---------------|---------------| > > This doesn't look that bad _if_ you actually built most modules. But if > it was alldefconfig, you likely missed the majority of modules. 
I definitely built allmodconfig, so I guess "this doesn't look bad" :) > >>> Do we have any better way to detect this other than actual compilation >>> on arm64? Can we hack something around COMPILE_TEST like redefine >>> PAGE_SIZE (for modules only) to a variable so that we have a better >>> chance of detecting build failures when modules are only tested on other >>> architectures? >> >> I can certainly look into this. But if the concern is that drivers are not being >> compiled against arm64, what is the likelyhood of them being compiled against >> COMPILE_TEST? > > Hopefully some CIs out there catching them. Well, if we are to fix them > anyway, we might as well eventually force a non-const PAGE_SIZE > generically even if it returns a constant. > > I'm building allmod now with something like below (and some hacks in > arch and core code to use STATIC_PAGE_* as I did not apply your > patches). alldefconfig passes with my hacks but, as you can see, the > non-const PAGE_SIZE kicks in only if MODULE is defined. So, not an > accurate test, just to get a feel of the modules problem. Nice. I guess that's pretty much the change we would add for x86 with COMPILE_TEST. 
> > ----------8<--------------------------- > diff --git a/arch/arm64/include/asm/page-def.h b/arch/arm64/include/asm/page-def.h > index 792e9fe881dc..71a761f86b15 100644 > --- a/arch/arm64/include/asm/page-def.h > +++ b/arch/arm64/include/asm/page-def.h > @@ -12,7 +12,19 @@ > > /* PAGE_SHIFT determines the page size */ > #define PAGE_SHIFT CONFIG_PAGE_SHIFT > -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) > +#define STATIC_PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) > +#define STATIC_PAGE_MASK (~(STATIC_PAGE_SIZE-1)) > + > +#if !defined(MODULE) || defined(__ASSEMBLY__) > +#define PAGE_SIZE STATIC_PAGE_SIZE > +#else > +static inline unsigned long __runtime_page_size(void) > +{ > + return 1UL << PAGE_SHIFT; > +} > +#define PAGE_SIZE (__runtime_page_size()) > +#endif > + > #define PAGE_MASK (~(PAGE_SIZE-1)) > > #endif /* __ASM_PAGE_DEF_H */ > ----------8<--------------------------- >
On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: > Hi All, > > Patch bomb incoming... This covers many subsystems, so I've included a core > set of people on the full series and additionally included maintainers on > relevant patches. I haven't included those maintainers on this cover letter > since the numbers were far too big for it to work. But I've included a link > to this cover letter on each patch, so they can hopefully find their way > here. For follow up submissions I'll break it up by subsystem, but for now > thought it was important to show the full picture. > > This RFC series implements support for boot-time page size selection within > the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to > date, page size has been selected at compile-time, meaning the size is > baked into a given kernel image. As use of larger-than-4K page sizes become > more prevalent this starts to present a problem for distributions. > Boot-time page size selection enables the creation of a single kernel > image, which can be told which page size to use on the kernel command line. > > Why is having an image-per-page size problematic? > ================================================= > > Many traditional distros are now supporting both 4K and 64K. And this means > managing 2 kernel packages, along with drivers for each. For some, it means > multiple installer flavours and multiple ISOs. All of this adds up to a > less-than-ideal level of complexity. Additionally, Android now supports 4K > and 16K kernels. I'm told having to explicitly manage their KABI for each > kernel is painful, and the extra flash space required for both kernel > images and the duplicated modules has been problematic. Boot-time page size > selection solves all of this. 
> > Additionally, in starting to think about the longer term deployment story > for D128 page tables, which Arm architecture now supports, a lot of the > same problems need to be solved, so this work sets us up nicely for that. > > So what's the down side? > ======================== > > Well nothing's free; Various static allocations in the kernel image must be > sized for the worst case (largest supported page size), so image size is in > line with size of 64K compile-time image. So if you're interested in 4K or > 16K, there is a slight increase to the image size. But I expect that > problem goes away if you're compressing the image - it's just some extra > zeros. At boot-time, I expect we could free the unused static storage once > we know the page size - although that would be a follow up enhancement. > > And then there is performance. Since PAGE_SIZE and friends are no longer > compile-time constants, we must look up their values and do arithmetic at > runtime instead of compile-time. My early perf testing suggests this is > imperceptible for real-world workloads, and only has small impact on > microbenchmarks - more on this below. > > Approach > ======== > > The basic idea is to rid the source of any assumptions that PAGE_SIZE and > friends are compile-time constant, but in a way that allows the compiler to > perform the same optimizations as was previously being done if they do turn > out to be compile-time constant. Where constants are required, we use > limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full > description of all the classes of problems to solve. > > By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. > arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > Kconfig, which is an alternative to selecting a compile-time page size.
> > When boot-time page size is active, the arch pgtable geometry macro > definitions resolve to something that can be configured at boot. The arm64 > implementation in this series mainly uses global, __ro_after_init > variables. I've tried using alternatives patching, but that performs worse > than loading from memory; I think due to code size bloat. > > Status > ====== > > When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented > enough to compile the kernel image itself with defconfig (and a few other > bits and pieces). This is enough to build a kernel that can boot under QEMU > or FVP. I'll happily do the rest of the work to enable all the extra > drivers, but wanted to get feedback on the shape of this effort first. If > anyone wants to do any testing, and has a must-have config, let me know and > I'll prioritize enabling it first. > > The series is arranged as follows: > > - patch 1: Add macros required for converting non-arch code to support > boot-time page size selection > - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from > all non-arch code > - patches 37-38: Some arm64 tidy ups > - patch 39: Add macros required for converting arm64 code to support > boot-time page size selection > - patches 40-56: arm64 changes to support boot-time page size selection > - patch 57: Add arm64 Kconfig option to enable boot-time page size > selection > > Ideally, I'd like to get the basics merged (something like this series), > then incrementally improve it over a handful of kernel releases until we > can demonstrate that we have feature parity with the compile-time build and > no performance blockers. Once at that point, ideally the compile-time build > options would be removed and the code could be cleaned up further. > > One of the bigger pieces that I'd propose to add as a follow up is to make > va-size boot-time selectable too. That will greatly simplify LPA2 fallback > handling.
> > Assuming people are amenable to the rough shape, how would I go about > getting the non-arch changes merged? Since they cover many subsystems, will > each piece need to go independently to each relevant maintainer or could it > all be merged together through the arm64 tree? > > Image Size > ========== > > The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > kernel image on disk for base (before any changes applied), compile (with > changes, configured for compile-time page size) and boot (with changes, > configured for boot-time page size). > > You can see that the compile-16k and 64k configs are actually slightly > smaller than the baselines; that's due to optimizing some buffer sizes > which didn't need to depend on page size during the series. The boot-time > image is ~1% bigger than the 64k compile-time image. I believe there is > scope to improve this to make it > equal to compile-64k if required: > | config | size/KB | diff/KB | diff/% | > |-------------|---------|---------|---------| > | base-4k | 54895 | 0 | 0.0% | > | base-16k | 55161 | 266 | 0.5% | > | base-64k | 56775 | 1880 | 3.4% | > | compile-4k | 54895 | 0 | 0.0% | > | compile-16k | 55097 | 202 | 0.4% | > | compile-64k | 56391 | 1496 | 2.7% | > | boot-4K | 57045 | 2150 | 3.9% | > > And below shows the size of the image in memory at run-time, separated for > text and data costs. The boot image has ~1% text cost; most likely due to > the fact that PAGE_SIZE and friends are not compile-time constants so need > instructions to load the values and do arithmetic.
I believe we could > eventually get the data cost to match the cost for the compile image for > the chosen page size by freeing > the ends of the static buffers not needed for the selected page size: > | | text | text | text | data | data | data | > | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > > Functional Testing > ================== > > I've build-tested defconfig for all arches supported by tuxmake (which is > most) without issue. > > I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page > sizes and a few va-sizes, and additionally have run all the mm-selftests, > with no regressions observed vs the equivalent compile-time page size build > (although the mm-selftests have a few existing failures when run against > 16K and 64K kernels - those should really be investigated and fixed > independently). > > Test coverage is lacking for many of the drivers that I've touched, but in > many cases, I'm hoping the changes are simple enough that review might > suffice? > > Performance Testing > =================== > > I've run some limited performance benchmarks: > > First, a real-world benchmark that causes a lot of page table manipulation > (and therefore we would expect to see regression here if we are going to > see it anywhere); kernel compilation. It barely registers a change. Values > are times, > so smaller is better.
All relative to base-4k: > | | kern | kern | user | user | real | real | > | config | mean | stdev | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > > The Speedometer JavaScript benchmark also shows no change. Values are runs > per > min, so bigger is better. All relative to base-4k: > | config | mean | stdev | > |-------------|---------|---------| > | base-4k | 0.0% | 0.8% | > | compile-4k | 0.4% | 0.8% | > | boot-4k | 0.0% | 0.9% | > > Finally, I've run some microbenchmarks known to stress page table > manipulations (originally from David Hildenbrand). The fork test > maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap > test maps/allocs 1G of anon memory then measures the cost of munmap()ing > it. The fork test is known to be extremely sensitive to any changes that > cause instructions to be aligned differently in cachelines. When using this > test for other changes, I've seen double digit regressions for the > slightest thing, so 12% regression on this test is actually fairly good. > This likely represents the extreme worst case for regressions that will be > observed across other microbenchmarks (famous last > words). Values are times, so smaller is better. All relative to base-4k: > | | fork | fork | munmap | munmap | > | config | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > > NOTE: The series applies on top of v6.11.
> > Thanks, > Ryan > > > Ryan Roberts (57): > mm: Add macros ahead of supporting boot-time page size selection > vmlinux: Align to PAGE_SIZE_MAX > mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > mm/page_alloc: Make page_frag_cache boot-time page size compatible > mm: Avoid split pmd ptl if pmd level is run-time folded > mm: Remove PAGE_SIZE compile-time constant assumption > fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > fs: Remove PAGE_SIZE compile-time constant assumption > fs/nfs: Remove PAGE_SIZE compile-time constant assumption > fs/ext4: Remove PAGE_SIZE compile-time constant assumption > fork: Permit boot-time THREAD_SIZE determination > cgroup: Remove PAGE_SIZE compile-time constant assumption > bpf: Remove PAGE_SIZE compile-time constant assumption > pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > stackdepot: Remove PAGE_SIZE compile-time constant assumption > perf: Remove PAGE_SIZE compile-time constant assumption > kvm: Remove PAGE_SIZE compile-time constant assumption > trace: Remove PAGE_SIZE compile-time constant assumption > crash: Remove PAGE_SIZE compile-time constant assumption > crypto: Remove PAGE_SIZE compile-time constant assumption > sunrpc: Remove PAGE_SIZE compile-time constant assumption > sound: Remove PAGE_SIZE compile-time constant assumption > net: Remove PAGE_SIZE compile-time constant assumption > net: fec: Remove PAGE_SIZE compile-time constant assumption > net: marvell: Remove PAGE_SIZE compile-time constant assumption > net: hns3: Remove PAGE_SIZE compile-time constant assumption > net: e1000: Remove PAGE_SIZE compile-time constant assumption > net: igbvf: Remove PAGE_SIZE compile-time constant assumption > net: igb: Remove PAGE_SIZE compile-time constant assumption > drivers/base: Remove PAGE_SIZE compile-time constant assumption > edac: Remove PAGE_SIZE compile-time constant assumption > optee: Remove PAGE_SIZE compile-time constant assumption > random: Remove PAGE_SIZE 
compile-time constant assumption > sata_sil24: Remove PAGE_SIZE compile-time constant assumption > virtio: Remove PAGE_SIZE compile-time constant assumption > xen: Remove PAGE_SIZE compile-time constant assumption > arm64: Fix macros to work in C code in addition to the linker script > arm64: Track early pgtable allocation limit > arm64: Introduce macros required for boot-time page selection > arm64: Refactor early pgtable size calculation macros > arm64: Pass desired page size on command line > arm64: Divorce early init from PAGE_SIZE > arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > arm64: Align sections to PAGE_SIZE_MAX > arm64: Rework trampoline rodata mapping > arm64: Generalize fixmap for boot-time page size > arm64: Statically allocate and align for worst-case page size > arm64: Convert switch to if for non-const comparison values > arm64: Convert BUILD_BUG_ON to VM_BUG_ON > arm64: Remove PAGE_SZ asm-offset > arm64: Introduce cpu features for page sizes > arm64: Remove PAGE_SIZE from assembly code > arm64: Runtime-fold pmd level > arm64: Support runtime folding in idmap_kpti_install_ng_mappings > arm64: TRAMP_VALIAS is no longer compile-time constant > arm64: Determine THREAD_SIZE at boot-time > arm64: Enable boot-time page size selection > > arch/alpha/include/asm/page.h | 1 + > arch/arc/include/asm/page.h | 1 + > arch/arm/include/asm/page.h | 1 + > arch/arm64/Kconfig | 26 ++- > arch/arm64/include/asm/assembler.h | 78 ++++++- > arch/arm64/include/asm/cpufeature.h | 44 +++- > arch/arm64/include/asm/efi.h | 2 +- > arch/arm64/include/asm/fixmap.h | 28 ++- > arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > arch/arm64/include/asm/kvm_arm.h | 21 +- > arch/arm64/include/asm/kvm_hyp.h | 11 + > arch/arm64/include/asm/kvm_pgtable.h | 6 +- > arch/arm64/include/asm/memory.h | 62 ++++-- > arch/arm64/include/asm/page-def.h | 3 +- > arch/arm64/include/asm/pgalloc.h | 16 +- > arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > 
arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > arch/arm64/include/asm/pgtable-prot.h | 2 +- > arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > arch/arm64/include/asm/processor.h | 10 +- > arch/arm64/include/asm/sections.h | 1 + > arch/arm64/include/asm/smp.h | 1 + > arch/arm64/include/asm/sparsemem.h | 15 +- > arch/arm64/include/asm/sysreg.h | 54 +++-- > arch/arm64/include/asm/tlb.h | 3 + > arch/arm64/kernel/asm-offsets.c | 4 +- > arch/arm64/kernel/cpufeature.c | 93 ++++++-- > arch/arm64/kernel/efi.c | 2 +- > arch/arm64/kernel/entry.S | 60 +++++- > arch/arm64/kernel/head.S | 46 +++- > arch/arm64/kernel/hibernate-asm.S | 6 +- > arch/arm64/kernel/image-vars.h | 14 ++ > arch/arm64/kernel/image.h | 4 + > arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > arch/arm64/kernel/pi/pi.h | 63 +++++- > arch/arm64/kernel/relocate_kernel.S | 10 +- > arch/arm64/kernel/vdso-wrap.S | 4 +- > arch/arm64/kernel/vdso.c | 7 +- > arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > arch/arm64/kernel/vdso32-wrap.S | 4 +- > arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > arch/arm64/kvm/arm.c | 10 + > arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > arch/arm64/kvm/mmu.c | 39 ++-- > arch/arm64/lib/clear_page.S | 7 +- > arch/arm64/lib/copy_page.S | 33 ++- > arch/arm64/lib/mte.S | 27 ++- > arch/arm64/mm/Makefile | 1 + > arch/arm64/mm/fixmap.c | 38 ++-- > arch/arm64/mm/hugetlbpage.c | 40 +--- > arch/arm64/mm/init.c | 26 +-- > arch/arm64/mm/kasan_init.c | 8 +- > arch/arm64/mm/mmu.c | 53 +++-- > arch/arm64/mm/pgd.c | 12 +- > arch/arm64/mm/pgtable-geometry.c | 24 +++ > arch/arm64/mm/proc.S | 128 ++++++++--- > arch/arm64/mm/ptdump.c | 3 +- > arch/arm64/tools/cpucaps | 3 + > arch/csky/include/asm/page.h | 3 + > 
arch/hexagon/include/asm/page.h | 2 + > arch/loongarch/include/asm/page.h | 2 + > arch/m68k/include/asm/page.h | 1 + > arch/microblaze/include/asm/page.h | 1 + > arch/mips/include/asm/page.h | 1 + > arch/nios2/include/asm/page.h | 2 + > arch/openrisc/include/asm/page.h | 1 + > arch/parisc/include/asm/page.h | 1 + > arch/powerpc/include/asm/page.h | 2 + > arch/riscv/include/asm/page.h | 1 + > arch/s390/include/asm/page.h | 1 + > arch/sh/include/asm/page.h | 1 + > arch/sparc/include/asm/page.h | 3 + > arch/um/include/asm/page.h | 2 + > arch/x86/include/asm/page_types.h | 2 + > arch/xtensa/include/asm/page.h | 1 + > crypto/lskcipher.c | 4 +- > drivers/ata/sata_sil24.c | 46 ++-- > drivers/base/node.c | 6 +- > drivers/base/topology.c | 32 +-- > drivers/block/virtio_blk.c | 2 +- > drivers/char/random.c | 4 +- > drivers/edac/edac_mc.h | 13 +- > drivers/firmware/efi/libstub/arm64.c | 3 +- > drivers/irqchip/irq-gic-v3-its.c | 2 +- > drivers/mtd/mtdswap.c | 4 +- > drivers/net/ethernet/freescale/fec.h | 3 +- > drivers/net/ethernet/freescale/fec_main.c | 5 +- > .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > drivers/net/ethernet/intel/igb/igb.h | 25 +-- > drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > drivers/net/ethernet/marvell/mvneta.c | 9 +- > drivers/net/ethernet/marvell/sky2.h | 2 +- > drivers/tee/optee/call.c | 7 +- > drivers/tee/optee/smc_abi.c | 2 +- > drivers/virtio/virtio_balloon.c | 10 +- > drivers/xen/balloon.c | 11 +- > drivers/xen/biomerge.c | 12 +- > drivers/xen/privcmd.c | 2 +- > drivers/xen/xenbus/xenbus_client.c | 5 +- > drivers/xen/xlate_mmu.c | 6 +- > fs/binfmt_elf.c | 11 +- > fs/buffer.c | 2 +- > fs/coredump.c | 8 +- > fs/ext4/ext4.h | 36 ++-- > fs/ext4/move_extent.c | 2 +- > fs/ext4/readpage.c | 2 +- > fs/fat/dir.c | 4 +- > fs/fat/fatent.c | 4 +- > fs/nfs/nfs42proc.c | 2 +- > fs/nfs/nfs42xattr.c | 2 +- > fs/nfs/nfs4proc.c | 2 +- 
> include/asm-generic/pgtable-geometry.h | 71 +++++++ > include/asm-generic/vmlinux.lds.h | 38 ++-- > include/linux/buffer_head.h | 1 + > include/linux/cpumask.h | 5 + > include/linux/linkage.h | 4 +- > include/linux/mm.h | 17 +- > include/linux/mm_types.h | 15 +- > include/linux/mm_types_task.h | 2 +- > include/linux/mmzone.h | 3 +- > include/linux/netlink.h | 6 +- > include/linux/percpu-defs.h | 4 +- > include/linux/perf_event.h | 2 +- > include/linux/sched.h | 4 +- > include/linux/slab.h | 7 +- > include/linux/stackdepot.h | 6 +- > include/linux/sunrpc/svc.h | 8 +- > include/linux/sunrpc/svc_rdma.h | 4 +- > include/linux/sunrpc/svcsock.h | 2 +- > include/linux/swap.h | 17 +- > include/linux/swapops.h | 6 +- > include/linux/thread_info.h | 10 +- > include/xen/page.h | 2 + > init/main.c | 7 +- > kernel/bpf/core.c | 9 +- > kernel/bpf/ringbuf.c | 54 ++--- > kernel/cgroup/cgroup.c | 8 +- > kernel/crash_core.c | 2 +- > kernel/events/core.c | 2 +- > kernel/fork.c | 71 +++---- > kernel/power/power.h | 2 +- > kernel/power/snapshot.c | 2 +- > kernel/power/swap.c | 129 +++++++++-- > kernel/trace/fgraph.c | 2 +- > kernel/trace/trace.c | 2 +- > lib/stackdepot.c | 6 +- > mm/kasan/report.c | 3 +- > mm/memcontrol.c | 11 +- > mm/memory.c | 4 +- > mm/mmap.c | 2 +- > mm/page-writeback.c | 2 +- > mm/page_alloc.c | 31 +-- > mm/slub.c | 2 +- > mm/sparse.c | 2 +- > mm/swapfile.c | 2 +- > mm/vmalloc.c | 7 +- > net/9p/trans_virtio.c | 4 +- > net/core/hotdata.c | 4 +- > net/core/skbuff.c | 4 +- > net/core/sysctl_net_core.c | 2 +- > net/sunrpc/cache.c | 3 +- > net/unix/af_unix.c | 2 +- > sound/soc/soc-utils.c | 4 +- > virt/kvm/kvm_main.c | 2 +- > 172 files changed, 2185 insertions(+), 951 deletions(-) > create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > create mode 100644 arch/arm64/mm/pgtable-geometry.c > create mode 100644 include/asm-generic/pgtable-geometry.h > > -- > 2.43.0 This is a generally very exciting 
patch set! I'm looking forward to seeing it land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. That said, I have a couple of questions: * Going forward, how would we handle drivers/modules that require a particular page size? For example, the Apple Silicon IOMMU driver code requires the kernel to operate in 16k page size mode, and it would need to be disabled in other page sizes. * How would we handle an invalid selection at boot? Can we program in a fallback when the "wrong" mode is selected for a chip or something similar? Thanks again and best regards! (P.S.: Please add the asahi@ mailing list to the CC for future iterations of this patch set and tag both Hector and myself in as well. Thanks!) -- 真実はいつも一つ!/ Always, there's only one truth!
On 19/10/2024 16:47, Neal Gompa wrote: > On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: >> Hi All, >> >> Patch bomb incoming... This covers many subsystems, so I've included a core >> set of people on the full series and additionally included maintainers on >> relevant patches. I haven't included those maintainers on this cover letter >> since the numbers were far too big for it to work. But I've included a link >> to this cover letter on each patch, so they can hopefully find their way >> here. For follow up submissions I'll break it up by subsystem, but for now >> thought it was important to show the full picture. >> >> This RFC series implements support for boot-time page size selection within >> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to >> date, page size has been selected at compile-time, meaning the size is >> baked into a given kernel image. As use of larger-than-4K page sizes become >> more prevalent this starts to present a problem for distributions. >> Boot-time page size selection enables the creation of a single kernel >> image, which can be told which page size to use on the kernel command line. >> >> Why is having an image-per-page size problematic? >> ================================================= >> >> Many traditional distros are now supporting both 4K and 64K. And this means >> managing 2 kernel packages, along with drivers for each. For some, it means >> multiple installer flavours and multiple ISOs. All of this adds up to a >> less-than-ideal level of complexity. Additionally, Android now supports 4K >> and 16K kernels. I'm told having to explicitly manage their KABI for each >> kernel is painful, and the extra flash space required for both kernel >> images and the duplicated modules has been problematic. Boot-time page size >> selection solves all of this. 
>> >> Additionally, in starting to think about the longer term deployment story >> for D128 page tables, which Arm architecture now supports, a lot of the >> same problems need to be solved, so this work sets us up nicely for that. >> >> So what's the down side? >> ======================== >> >> Well nothing's free; Various static allocations in the kernel image must be >> sized for the worst case (largest supported page size), so image size is in >> line with size of 64K compile-time image. So if you're interested in 4K or >> 16K, there is a slight increase to the image size. But I expect that >> problem goes away if you're compressing the image - it's just some extra >> zeros. At boot-time, I expect we could free the unused static storage once >> we know the page size - although that would be a follow up enhancement. >> >> And then there is performance. Since PAGE_SIZE and friends are no longer >> compile-time constants, we must look up their values and do arithmetic at >> runtime instead of compile-time. My early perf testing suggests this is >> imperceptible for real-world workloads, and only has small impact on >> microbenchmarks - more on this below. >> >> Approach >> ======== >> >> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >> friends are compile-time constant, but in a way that allows the compiler to >> perform the same optimizations as was previously being done if they do turn >> out to be compile-time constant. Where constants are required, we use >> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full >> description of all the classes of problems to solve. >> >> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. >> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >> Kconfig, which is an alternative to selecting a compile-time page size.
>> >> When boot-time page size is active, the arch pgtable geometry macro >> definitions resolve to something that can be configured at boot. The arm64 >> implementation in this series mainly uses global, __ro_after_init >> variables. I've tried using alternatives patching, but that performs worse >> than loading from memory; I think due to code size bloat. >> >> Status >> ====== >> >> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented >> enough to compile the kernel image itself with defconfig (and a few other >> bits and pieces). This is enough to build a kernel that can boot under QEMU >> or FVP. I'll happily do the rest of the work to enable all the extra >> drivers, but wanted to get feedback on the shape of this effort first. If >> anyone wants to do any testing, and has a must-have config, let me know and >> I'll prioritize enabling it first. >> >> The series is arranged as follows: >> >> - patch 1: Add macros required for converting non-arch code to support >> boot-time page size selection >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from >> all non-arch code >> - patches 37-38: Some arm64 tidy ups >> - patch 39: Add macros required for converting arm64 code to support >> boot-time page size selection >> - patches 40-56: arm64 changes to support boot-time page size selection >> - patch 57: Add arm64 Kconfig option to enable boot-time page size >> selection >> >> Ideally, I'd like to get the basics merged (something like this series), >> then incrementally improve it over a handful of kernel releases until we >> can demonstrate that we have feature parity with the compile-time build and >> no performance blockers. Once at that point, ideally the compile-time build >> options would be removed and the code could be cleaned up further. >> >> One of the bigger pieces that I'd propose to add as a follow up is to make >> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >> handling.
>> >> Assuming people are amenable to the rough shape, how would I go about >> getting the non-arch changes merged? Since they cover many subsystems, will >> each piece need to go independently to each relevant maintainer or could it >> all be merged together through the arm64 tree? >> >> Image Size >> ========== >> >> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >> kernel image on disk for base (before any changes applied), compile (with >> changes, configured for compile-time page size) and boot (with changes, >> configured for boot-time page size). >> >> You can see that the compile-16k and 64k configs are actually slightly >> smaller than the baselines; that's due to optimizing some buffer sizes >> which didn't need to depend on page size during the series. The boot-time >> image is ~1% bigger than the 64k compile-time image. I believe there is >> scope to improve this to make it >> equal to compile-64k if required: >> | config | size/KB | diff/KB | diff/% | >> | >> |-------------|---------|---------|---------| >> | >> | base-4k | 54895 | 0 | 0.0% | >> | base-16k | 55161 | 266 | 0.5% | >> | base-64k | 56775 | 1880 | 3.4% | >> | compile-4k | 54895 | 0 | 0.0% | >> | compile-16k | 55097 | 202 | 0.4% | >> | compile-64k | 56391 | 1496 | 2.7% | >> | boot-4K | 57045 | 2150 | 3.9% | >> >> And below shows the size of the image in memory at run-time, separated for >> text and data costs. The boot image has ~1% text cost; most likely due to >> the fact that PAGE_SIZE and friends are not compile-time constants so need >> instructions to load the values and do arithmetic.
I believe we could >> eventually get the data cost to match the cost for the compile image for >> the chosen page size by freeing >> the ends of the static buffers not needed for the selected page size: >> | | text | text | text | data | data | data | >> | >> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >> | >> |-------------|---------|---------|---------|---------|---------|---------| >> | >> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >> >> Functional Testing >> ================== >> >> I've build-tested defconfig for all arches supported by tuxmake (which is >> most) without issue. >> >> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page >> sizes and a few va-sizes, and additionally have run all the mm-selftests, >> with no regressions observed vs the equivalent compile-time page size build >> (although the mm-selftests have a few existing failures when run against >> 16K and 64K kernels - those should really be investigated and fixed >> independently). >> >> Test coverage is lacking for many of the drivers that I've touched, but in >> many cases, I'm hoping the changes are simple enough that review might >> suffice? >> >> Performance Testing >> =================== >> >> I've run some limited performance benchmarks: >> >> First, a real-world benchmark that causes a lot of page table manipulation >> (and therefore we would expect to see regression here if we are going to >> see it anywhere); kernel compilation. It barely registers a change. Values >> are times, >> so smaller is better. 
All relative to base-4k: >> | | kern | kern | user | user | real | real | >> | >> | config | mean | stdev | mean | stdev | mean | stdev | >> | >> |-------------|---------|---------|---------|---------|---------|---------| >> | >> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >> >> The Speedometer JavaScript benchmark also shows no change. Values are runs >> per >> min, so bigger is better. All relative to base-4k: >> | config | mean | stdev | >> | >> |-------------|---------|---------| >> | >> | base-4k | 0.0% | 0.8% | >> | compile-4k | 0.4% | 0.8% | >> | boot-4k | 0.0% | 0.9% | >> >> Finally, I've run some microbenchmarks known to stress page table >> manipulations (originally from David Hildenbrand). The fork test >> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap >> test maps/allocs 1G of anon memory then measures the cost of munmap()ing >> it. The fork test is known to be extremely sensitive to any changes that >> cause instructions to be aligned differently in cachelines. When using this >> test for other changes, I've seen double digit regressions for the >> slightest thing, so a 12% regression on this test is actually fairly good. >> This likely represents the extreme worst case for regressions that will be >> observed across other microbenchmarks (famous last >> words). Values are times, so smaller is better. All relative to base-4k: >> | | fork | fork | munmap | munmap | >> | >> | config | mean | stdev | mean | stdev | >> | >> |-------------|---------|---------|---------|---------| >> | >> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >> >> NOTE: The series applies on top of v6.11.
>> >> Thanks, >> Ryan >> >> >> Ryan Roberts (57): >> mm: Add macros ahead of supporting boot-time page size selection >> vmlinux: Align to PAGE_SIZE_MAX >> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >> mm/page_alloc: Make page_frag_cache boot-time page size compatible >> mm: Avoid split pmd ptl if pmd level is run-time folded >> mm: Remove PAGE_SIZE compile-time constant assumption >> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >> fs: Remove PAGE_SIZE compile-time constant assumption >> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >> fork: Permit boot-time THREAD_SIZE determination >> cgroup: Remove PAGE_SIZE compile-time constant assumption >> bpf: Remove PAGE_SIZE compile-time constant assumption >> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >> stackdepot: Remove PAGE_SIZE compile-time constant assumption >> perf: Remove PAGE_SIZE compile-time constant assumption >> kvm: Remove PAGE_SIZE compile-time constant assumption >> trace: Remove PAGE_SIZE compile-time constant assumption >> crash: Remove PAGE_SIZE compile-time constant assumption >> crypto: Remove PAGE_SIZE compile-time constant assumption >> sunrpc: Remove PAGE_SIZE compile-time constant assumption >> sound: Remove PAGE_SIZE compile-time constant assumption >> net: Remove PAGE_SIZE compile-time constant assumption >> net: fec: Remove PAGE_SIZE compile-time constant assumption >> net: marvell: Remove PAGE_SIZE compile-time constant assumption >> net: hns3: Remove PAGE_SIZE compile-time constant assumption >> net: e1000: Remove PAGE_SIZE compile-time constant assumption >> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >> net: igb: Remove PAGE_SIZE compile-time constant assumption >> drivers/base: Remove PAGE_SIZE compile-time constant assumption >> edac: Remove PAGE_SIZE compile-time constant assumption >> optee: Remove PAGE_SIZE compile-time constant 
assumption >> random: Remove PAGE_SIZE compile-time constant assumption >> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >> virtio: Remove PAGE_SIZE compile-time constant assumption >> xen: Remove PAGE_SIZE compile-time constant assumption >> arm64: Fix macros to work in C code in addition to the linker script >> arm64: Track early pgtable allocation limit >> arm64: Introduce macros required for boot-time page selection >> arm64: Refactor early pgtable size calculation macros >> arm64: Pass desired page size on command line >> arm64: Divorce early init from PAGE_SIZE >> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >> arm64: Align sections to PAGE_SIZE_MAX >> arm64: Rework trampoline rodata mapping >> arm64: Generalize fixmap for boot-time page size >> arm64: Statically allocate and align for worst-case page size >> arm64: Convert switch to if for non-const comparison values >> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >> arm64: Remove PAGE_SZ asm-offset >> arm64: Introduce cpu features for page sizes >> arm64: Remove PAGE_SIZE from assembly code >> arm64: Runtime-fold pmd level >> arm64: Support runtime folding in idmap_kpti_install_ng_mappings >> arm64: TRAMP_VALIAS is no longer compile-time constant >> arm64: Determine THREAD_SIZE at boot-time >> arm64: Enable boot-time page size selection >> >> arch/alpha/include/asm/page.h | 1 + >> arch/arc/include/asm/page.h | 1 + >> arch/arm/include/asm/page.h | 1 + >> arch/arm64/Kconfig | 26 ++- >> arch/arm64/include/asm/assembler.h | 78 ++++++- >> arch/arm64/include/asm/cpufeature.h | 44 +++- >> arch/arm64/include/asm/efi.h | 2 +- >> arch/arm64/include/asm/fixmap.h | 28 ++- >> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >> arch/arm64/include/asm/kvm_arm.h | 21 +- >> arch/arm64/include/asm/kvm_hyp.h | 11 + >> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >> arch/arm64/include/asm/memory.h | 62 ++++-- >> arch/arm64/include/asm/page-def.h | 3 +- >> arch/arm64/include/asm/pgalloc.h | 16 +- 
>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >> arch/arm64/include/asm/pgtable-prot.h | 2 +- >> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >> arch/arm64/include/asm/processor.h | 10 +- >> arch/arm64/include/asm/sections.h | 1 + >> arch/arm64/include/asm/smp.h | 1 + >> arch/arm64/include/asm/sparsemem.h | 15 +- >> arch/arm64/include/asm/sysreg.h | 54 +++-- >> arch/arm64/include/asm/tlb.h | 3 + >> arch/arm64/kernel/asm-offsets.c | 4 +- >> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >> arch/arm64/kernel/efi.c | 2 +- >> arch/arm64/kernel/entry.S | 60 +++++- >> arch/arm64/kernel/head.S | 46 +++- >> arch/arm64/kernel/hibernate-asm.S | 6 +- >> arch/arm64/kernel/image-vars.h | 14 ++ >> arch/arm64/kernel/image.h | 4 + >> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >> arch/arm64/kernel/pi/pi.h | 63 +++++- >> arch/arm64/kernel/relocate_kernel.S | 10 +- >> arch/arm64/kernel/vdso-wrap.S | 4 +- >> arch/arm64/kernel/vdso.c | 7 +- >> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >> arch/arm64/kernel/vdso32-wrap.S | 4 +- >> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >> arch/arm64/kvm/arm.c | 10 + >> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >> arch/arm64/kvm/mmu.c | 39 ++-- >> arch/arm64/lib/clear_page.S | 7 +- >> arch/arm64/lib/copy_page.S | 33 ++- >> arch/arm64/lib/mte.S | 27 ++- >> arch/arm64/mm/Makefile | 1 + >> arch/arm64/mm/fixmap.c | 38 ++-- >> arch/arm64/mm/hugetlbpage.c | 40 +--- >> arch/arm64/mm/init.c | 26 +-- >> arch/arm64/mm/kasan_init.c | 8 +- >> arch/arm64/mm/mmu.c | 53 +++-- >> arch/arm64/mm/pgd.c | 12 +- >> arch/arm64/mm/pgtable-geometry.c | 24 +++ >> arch/arm64/mm/proc.S | 128 ++++++++--- >> 
arch/arm64/mm/ptdump.c | 3 +- >> arch/arm64/tools/cpucaps | 3 + >> arch/csky/include/asm/page.h | 3 + >> arch/hexagon/include/asm/page.h | 2 + >> arch/loongarch/include/asm/page.h | 2 + >> arch/m68k/include/asm/page.h | 1 + >> arch/microblaze/include/asm/page.h | 1 + >> arch/mips/include/asm/page.h | 1 + >> arch/nios2/include/asm/page.h | 2 + >> arch/openrisc/include/asm/page.h | 1 + >> arch/parisc/include/asm/page.h | 1 + >> arch/powerpc/include/asm/page.h | 2 + >> arch/riscv/include/asm/page.h | 1 + >> arch/s390/include/asm/page.h | 1 + >> arch/sh/include/asm/page.h | 1 + >> arch/sparc/include/asm/page.h | 3 + >> arch/um/include/asm/page.h | 2 + >> arch/x86/include/asm/page_types.h | 2 + >> arch/xtensa/include/asm/page.h | 1 + >> crypto/lskcipher.c | 4 +- >> drivers/ata/sata_sil24.c | 46 ++-- >> drivers/base/node.c | 6 +- >> drivers/base/topology.c | 32 +-- >> drivers/block/virtio_blk.c | 2 +- >> drivers/char/random.c | 4 +- >> drivers/edac/edac_mc.h | 13 +- >> drivers/firmware/efi/libstub/arm64.c | 3 +- >> drivers/irqchip/irq-gic-v3-its.c | 2 +- >> drivers/mtd/mtdswap.c | 4 +- >> drivers/net/ethernet/freescale/fec.h | 3 +- >> drivers/net/ethernet/freescale/fec_main.c | 5 +- >> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >> drivers/net/ethernet/marvell/mvneta.c | 9 +- >> drivers/net/ethernet/marvell/sky2.h | 2 +- >> drivers/tee/optee/call.c | 7 +- >> drivers/tee/optee/smc_abi.c | 2 +- >> drivers/virtio/virtio_balloon.c | 10 +- >> drivers/xen/balloon.c | 11 +- >> drivers/xen/biomerge.c | 12 +- >> drivers/xen/privcmd.c | 2 +- >> drivers/xen/xenbus/xenbus_client.c | 5 +- >> drivers/xen/xlate_mmu.c | 6 +- >> fs/binfmt_elf.c | 11 +- >> fs/buffer.c | 2 +- >> fs/coredump.c | 8 +- >> fs/ext4/ext4.h | 36 ++-- >> fs/ext4/move_extent.c | 2 +- >> 
fs/ext4/readpage.c | 2 +- >> fs/fat/dir.c | 4 +- >> fs/fat/fatent.c | 4 +- >> fs/nfs/nfs42proc.c | 2 +- >> fs/nfs/nfs42xattr.c | 2 +- >> fs/nfs/nfs4proc.c | 2 +- >> include/asm-generic/pgtable-geometry.h | 71 +++++++ >> include/asm-generic/vmlinux.lds.h | 38 ++-- >> include/linux/buffer_head.h | 1 + >> include/linux/cpumask.h | 5 + >> include/linux/linkage.h | 4 +- >> include/linux/mm.h | 17 +- >> include/linux/mm_types.h | 15 +- >> include/linux/mm_types_task.h | 2 +- >> include/linux/mmzone.h | 3 +- >> include/linux/netlink.h | 6 +- >> include/linux/percpu-defs.h | 4 +- >> include/linux/perf_event.h | 2 +- >> include/linux/sched.h | 4 +- >> include/linux/slab.h | 7 +- >> include/linux/stackdepot.h | 6 +- >> include/linux/sunrpc/svc.h | 8 +- >> include/linux/sunrpc/svc_rdma.h | 4 +- >> include/linux/sunrpc/svcsock.h | 2 +- >> include/linux/swap.h | 17 +- >> include/linux/swapops.h | 6 +- >> include/linux/thread_info.h | 10 +- >> include/xen/page.h | 2 + >> init/main.c | 7 +- >> kernel/bpf/core.c | 9 +- >> kernel/bpf/ringbuf.c | 54 ++--- >> kernel/cgroup/cgroup.c | 8 +- >> kernel/crash_core.c | 2 +- >> kernel/events/core.c | 2 +- >> kernel/fork.c | 71 +++---- >> kernel/power/power.h | 2 +- >> kernel/power/snapshot.c | 2 +- >> kernel/power/swap.c | 129 +++++++++-- >> kernel/trace/fgraph.c | 2 +- >> kernel/trace/trace.c | 2 +- >> lib/stackdepot.c | 6 +- >> mm/kasan/report.c | 3 +- >> mm/memcontrol.c | 11 +- >> mm/memory.c | 4 +- >> mm/mmap.c | 2 +- >> mm/page-writeback.c | 2 +- >> mm/page_alloc.c | 31 +-- >> mm/slub.c | 2 +- >> mm/sparse.c | 2 +- >> mm/swapfile.c | 2 +- >> mm/vmalloc.c | 7 +- >> net/9p/trans_virtio.c | 4 +- >> net/core/hotdata.c | 4 +- >> net/core/skbuff.c | 4 +- >> net/core/sysctl_net_core.c | 2 +- >> net/sunrpc/cache.c | 3 +- >> net/unix/af_unix.c | 2 +- >> sound/soc/soc-utils.c | 4 +- >> virt/kvm/kvm_main.c | 2 +- >> 172 files changed, 2185 insertions(+), 951 deletions(-) >> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >> create 
mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >> create mode 100644 arch/arm64/mm/pgtable-geometry.c >> create mode 100644 include/asm-generic/pgtable-geometry.h >> >> -- >> 2.43.0 > > This is a generally very exciting patch set! I'm looking forward to seeing it > land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. > > That said, I have a couple of questions: > > * Going forward, how would we handle drivers/modules that require a particular > page size? For example, the Apple Silicon IOMMU driver code requires the > kernel to operate in 16k page size mode, and it would need to be disabled in > other page sizes. I think these drivers would want to check PAGE_SIZE at probe time and fail if an unsupported page size is in use. Do you see any issue with that? > > * How would we handle an invalid selection at boot? What do you mean by invalid here? The current policy validates that the requested page size is supported by the HW by checking mmfr0. If no page size is passed on the command line, or the passed value is not supported by the HW, then we default to the largest page size supported by the HW (so for Apple Silicon that would be 16k since the HW doesn't support 64k). Although I think it may be better to change that policy to use the smallest page size in this case; 4k is the safer bet for compat and will waste much less memory than 64k. > Can we program in a > fallback when the "wrong" mode is selected for a chip or something similar? Do you mean effectively add a mechanism to force 16k if the detected HW is Apple Silicon? The trouble is that we need to select the page size very early in boot, before start_kernel() is called, so we really only have generic arch code and the command line with which to make the decision. > > Thanks again and best regards! > > (P.S.: Please add the asahi@ mailing list to the CC for future iterations of > this patch set and tag both Hector and myself in as well. Thanks!) Will do! > >
On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 19/10/2024 16:47, Neal Gompa wrote: > > On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: > >> Hi All, > >> > >> Patch bomb incoming... This covers many subsystems, so I've included a core > >> set of people on the full series and additionally included maintainers on > >> relevant patches. I haven't included those maintainers on this cover letter > >> since the numbers were far too big for it to work. But I've included a link > >> to this cover letter on each patch, so they can hopefully find their way > >> here. For follow up submissions I'll break it up by subsystem, but for now > >> thought it was important to show the full picture. > >> > >> This RFC series implements support for boot-time page size selection within > >> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to > >> date, page size has been selected at compile-time, meaning the size is > >> baked into a given kernel image. As use of larger-than-4K page sizes become > >> more prevalent this starts to present a problem for distributions. > >> Boot-time page size selection enables the creation of a single kernel > >> image, which can be told which page size to use on the kernel command line. > >> > >> Why is having an image-per-page size problematic? > >> ================================================= > >> > >> Many traditional distros are now supporting both 4K and 64K. And this means > >> managing 2 kernel packages, along with drivers for each. For some, it means > >> multiple installer flavours and multiple ISOs. All of this adds up to a > >> less-than-ideal level of complexity. Additionally, Android now supports 4K > >> and 16K kernels. I'm told having to explicitly manage their KABI for each > >> kernel is painful, and the extra flash space required for both kernel > >> images and the duplicated modules has been problematic. Boot-time page size > >> selection solves all of this. 
> >> > >> Additionally, in starting to think about the longer term deployment story > >> for D128 page tables, which Arm architecture now supports, a lot of the > >> same problems need to be solved, so this work sets us up nicely for that. > >> > >> So what's the down side? > >> ======================== > >> > >> Well nothing's free; Various static allocations in the kernel image must be > >> sized for the worst case (largest supported page size), so image size is in > >> line with size of 64K compile-time image. So if you're interested in 4K or > >> 16K, there is a slight increase to the image size. But I expect that > >> problem goes away if you're compressing the image - its just some extra > >> zeros. At boot-time, I expect we could free the unused static storage once > >> we know the page size - although that would be a follow up enhancement. > >> > >> And then there is performance. Since PAGE_SIZE and friends are no longer > >> compile-time constants, we must look up their values and do arithmetic at > >> runtime instead of compile-time. My early perf testing suggests this is > >> inperceptible for real-world workloads, and only has small impact on > >> microbenchmarks - more on this below. > >> > >> Approach > >> ======== > >> > >> The basic idea is to rid the source of any assumptions that PAGE_SIZE and > >> friends are compile-time constant, but in a way that allows the compiler to > >> perform the same optimizations as was previously being done if they do turn > >> out to be compile-time constant. Where constants are required, we use > >> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full > >> description of all the classes of problems to solve. > >> > >> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > >> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. 
> >> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > >> Kconfig, which is an alternative to selecting a compile-time page size. > >> > >> When boot-time page size is active, the arch pgtable geometry macro > >> definitions resolve to something that can be configured at boot. The arm64 > >> implementation in this series mainly uses global, __ro_after_init > >> variables. I've tried using alternatives patching, but that performs worse > >> than loading from memory; I think due to code size bloat. > >> > >> Status > >> ====== > >> > >> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented > >> enough to compile the kernel image itself with defconfig (and a few other > >> bits and pieces). This is enough to build a kernel that can boot under QEMU > >> or FVP. I'll happily do the rest of the work to enable all the extra > >> drivers, but wanted to get feedback on the shape of this effort first. If > >> anyone wants to do any testing, and has a must-have config, let me know and > >> I'll prioritize enabling it first. > >> > >> The series is arranged as follows: > >> > >> - patch 1: Add macros required for converting non-arch code to support > >> boot-time page size selection > >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from > >> all non-arch code > >> - patches 37-38: Some arm64 tidy ups > >> - patch 39: Add macros required for converting arm64 code to > > support > >> boot-time page size selection > >> - patches 40-56: arm64 changes to support boot-time page size selection > >> - patch 57: Add arm64 Kconfig option to enable boot-time page > > size > >> selection > >> > >> Ideally, I'd like to get the basics merged (something like this series), > >> then incrementally improve it over a handful of kernel releases until we > >> can demonstrate that we have feature parity with the compile-time build and > >> no performance blockers. 
Once at that point, ideally the compile-time build > >> options would be removed and the code could be cleaned up further. > >> > >> One of the bigger peices that I'd propose to add as a follow up, is to make > >> va-size boot-time selectable too. That will greatly simplify LPA2 fallback > >> handling. > >> > >> Assuming people are ammenable to the rough shape, how would I go about > >> getting the non-arch changes merged? Since they cover many subsystems, will > >> each piece need to go independently to each relevant maintainer or could it > >> all be merged together through the arm64 tree? > >> > >> Image Size > >> ========== > >> > >> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > >> kernel image on disk for base (before any changes applied), compile (with > >> changes, configured for compile-time page size) and boot (with changes, > >> configured for boot-time page size). > >> > >> You can see the that compile-16k and 64k configs are actually slightly > >> smaller than the baselines; that's due to optimizing some buffer sizes > >> which didn't need to depend on page size during the series. The boot-time > >> image is ~1% bigger than the 64k compile-time image. I believe there is > >> scope to improve this to make it > >> equal to compile-64k if required: > >> | config | size/KB | diff/KB | diff/% | > >> | > >> |-------------|---------|---------|---------| > >> | > >> | base-4k | 54895 | 0 | 0.0% | > >> | base-16k | 55161 | 266 | 0.5% | > >> | base-64k | 56775 | 1880 | 3.4% | > >> | compile-4k | 54895 | 0 | 0.0% | > >> | compile-16k | 55097 | 202 | 0.4% | > >> | compile-64k | 56391 | 1496 | 2.7% | > >> | boot-4K | 57045 | 2150 | 3.9% | > >> > >> And below shows the size of the image in memory at run-time, separated for > >> text and data costs. The boot image has ~1% text cost; most likely due to > >> the fact that PAGE_SIZE and friends are not compile-time constants so need > >> instructions to load the values and do arithmetic. 
I believe we could > >> eventually get the data cost to match the cost for the compile image for > >> the chosen page size by freeing > >> the ends of the static buffers not needed for the selected page size: > >> | | text | text | text | data | data | data | > >> | > >> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > >> | > >> |-------------|---------|---------|---------|---------|---------|---------| > >> | > >> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > >> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > >> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > >> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > >> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > >> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > >> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > >> > >> Functional Testing > >> ================== > >> > >> I've build-tested defconfig for all arches supported by tuxmake (which is > >> most) without issue. > >> > >> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page > >> sizes and a few va-sizes, and additionally have run all the mm-selftests, > >> with no regressions observed vs the equivalent compile-time page size build > >> (although the mm-selftests have a few existing failures when run against > >> 16K and 64K kernels - those should really be investigated and fixed > >> independently). > >> > >> Test coverage is lacking for many of the drivers that I've touched, but in > >> many cases, I'm hoping the changes are simple enough that review might > >> suffice? > >> > >> Performance Testing > >> =================== > >> > >> I've run some limited performance benchmarks: > >> > >> First, a real-world benchmark that causes a lot of page table manipulation > >> (and therefore we would expect to see regression here if we are going to > >> see it anywhere); kernel compilation. It barely registers a change. 
Values > >> are times, > >> so smaller is better. All relative to base-4k: > >> | | kern | kern | user | user | real | real | > >> | > >> | config | mean | stdev | mean | stdev | mean | stdev | > >> | > >> |-------------|---------|---------|---------|---------|---------|---------| > >> | > >> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > >> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > >> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > >> > >> The Speedometer JavaScript benchmark also shows no change. Values are runs > >> per > >> min, so bigger is better. All relative to base-4k: > >> | config | mean | stdev | > >> | > >> |-------------|---------|---------| > >> | > >> | base-4k | 0.0% | 0.8% | > >> | compile-4k | 0.4% | 0.8% | > >> | boot-4k | 0.0% | 0.9% | > >> > >> Finally, I've run some microbenchmarks known to stress page table > >> manipulations (originally from David Hildenbrand). The fork test > >> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap > >> test maps/allocs 1G of anon memory then measures the cost of munmap()ing > >> it. The fork test is known to be extremely sensitive to any changes that > >> cause instructions to be aligned differently in cachelines. When using this > >> test for other changes, I've seen double digit regressions for the > >> slightest thing, so 12% regression on this test is actually fairly good. > >> This likely represents the extreme worst case for regressions that will be > >> observed across other microbenchmarks (famous last > >> words). Values are times, so smaller is better. All relative to base-4k: > >> | | fork | fork | munmap | munmap | > >> | > >> | config | mean | stdev | stdev | stdev | > >> | > >> |-------------|---------|---------|---------|---------| > >> | > >> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > >> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > >> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > >> > >> NOTE: The series applies on top of v6.11. 
> >> > >> Thanks, > >> Ryan > >> > >> > >> Ryan Roberts (57): > >> mm: Add macros ahead of supporting boot-time page size selection > >> vmlinux: Align to PAGE_SIZE_MAX > >> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > >> mm/page_alloc: Make page_frag_cache boot-time page size compatible > >> mm: Avoid split pmd ptl if pmd level is run-time folded > >> mm: Remove PAGE_SIZE compile-time constant assumption > >> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > >> fs: Remove PAGE_SIZE compile-time constant assumption > >> fs/nfs: Remove PAGE_SIZE compile-time constant assumption > >> fs/ext4: Remove PAGE_SIZE compile-time constant assumption > >> fork: Permit boot-time THREAD_SIZE determination > >> cgroup: Remove PAGE_SIZE compile-time constant assumption > >> bpf: Remove PAGE_SIZE compile-time constant assumption > >> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > >> stackdepot: Remove PAGE_SIZE compile-time constant assumption > >> perf: Remove PAGE_SIZE compile-time constant assumption > >> kvm: Remove PAGE_SIZE compile-time constant assumption > >> trace: Remove PAGE_SIZE compile-time constant assumption > >> crash: Remove PAGE_SIZE compile-time constant assumption > >> crypto: Remove PAGE_SIZE compile-time constant assumption > >> sunrpc: Remove PAGE_SIZE compile-time constant assumption > >> sound: Remove PAGE_SIZE compile-time constant assumption > >> net: Remove PAGE_SIZE compile-time constant assumption > >> net: fec: Remove PAGE_SIZE compile-time constant assumption > >> net: marvell: Remove PAGE_SIZE compile-time constant assumption > >> net: hns3: Remove PAGE_SIZE compile-time constant assumption > >> net: e1000: Remove PAGE_SIZE compile-time constant assumption > >> net: igbvf: Remove PAGE_SIZE compile-time constant assumption > >> net: igb: Remove PAGE_SIZE compile-time constant assumption > >> drivers/base: Remove PAGE_SIZE compile-time constant assumption > >> edac: Remove PAGE_SIZE compile-time 
constant assumption > >> optee: Remove PAGE_SIZE compile-time constant assumption > >> random: Remove PAGE_SIZE compile-time constant assumption > >> sata_sil24: Remove PAGE_SIZE compile-time constant assumption > >> virtio: Remove PAGE_SIZE compile-time constant assumption > >> xen: Remove PAGE_SIZE compile-time constant assumption > >> arm64: Fix macros to work in C code in addition to the linker script > >> arm64: Track early pgtable allocation limit > >> arm64: Introduce macros required for boot-time page selection > >> arm64: Refactor early pgtable size calculation macros > >> arm64: Pass desired page size on command line > >> arm64: Divorce early init from PAGE_SIZE > >> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > >> arm64: Align sections to PAGE_SIZE_MAX > >> arm64: Rework trampoline rodata mapping > >> arm64: Generalize fixmap for boot-time page size > >> arm64: Statically allocate and align for worst-case page size > >> arm64: Convert switch to if for non-const comparison values > >> arm64: Convert BUILD_BUG_ON to VM_BUG_ON > >> arm64: Remove PAGE_SZ asm-offset > >> arm64: Introduce cpu features for page sizes > >> arm64: Remove PAGE_SIZE from assembly code > >> arm64: Runtime-fold pmd level > >> arm64: Support runtime folding in idmap_kpti_install_ng_mappings > >> arm64: TRAMP_VALIAS is no longer compile-time constant > >> arm64: Determine THREAD_SIZE at boot-time > >> arm64: Enable boot-time page size selection > >> > >> arch/alpha/include/asm/page.h | 1 + > >> arch/arc/include/asm/page.h | 1 + > >> arch/arm/include/asm/page.h | 1 + > >> arch/arm64/Kconfig | 26 ++- > >> arch/arm64/include/asm/assembler.h | 78 ++++++- > >> arch/arm64/include/asm/cpufeature.h | 44 +++- > >> arch/arm64/include/asm/efi.h | 2 +- > >> arch/arm64/include/asm/fixmap.h | 28 ++- > >> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > >> arch/arm64/include/asm/kvm_arm.h | 21 +- > >> arch/arm64/include/asm/kvm_hyp.h | 11 + > >> 
arch/arm64/include/asm/kvm_pgtable.h | 6 +- > >> arch/arm64/include/asm/memory.h | 62 ++++-- > >> arch/arm64/include/asm/page-def.h | 3 +- > >> arch/arm64/include/asm/pgalloc.h | 16 +- > >> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > >> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > >> arch/arm64/include/asm/pgtable-prot.h | 2 +- > >> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > >> arch/arm64/include/asm/processor.h | 10 +- > >> arch/arm64/include/asm/sections.h | 1 + > >> arch/arm64/include/asm/smp.h | 1 + > >> arch/arm64/include/asm/sparsemem.h | 15 +- > >> arch/arm64/include/asm/sysreg.h | 54 +++-- > >> arch/arm64/include/asm/tlb.h | 3 + > >> arch/arm64/kernel/asm-offsets.c | 4 +- > >> arch/arm64/kernel/cpufeature.c | 93 ++++++-- > >> arch/arm64/kernel/efi.c | 2 +- > >> arch/arm64/kernel/entry.S | 60 +++++- > >> arch/arm64/kernel/head.S | 46 +++- > >> arch/arm64/kernel/hibernate-asm.S | 6 +- > >> arch/arm64/kernel/image-vars.h | 14 ++ > >> arch/arm64/kernel/image.h | 4 + > >> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > >> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > >> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > >> arch/arm64/kernel/pi/pi.h | 63 +++++- > >> arch/arm64/kernel/relocate_kernel.S | 10 +- > >> arch/arm64/kernel/vdso-wrap.S | 4 +- > >> arch/arm64/kernel/vdso.c | 7 +- > >> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > >> arch/arm64/kernel/vdso32-wrap.S | 4 +- > >> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > >> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > >> arch/arm64/kvm/arm.c | 10 + > >> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > >> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > >> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > >> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > >> arch/arm64/kvm/mmu.c | 39 ++-- > >> arch/arm64/lib/clear_page.S | 7 +- > >> arch/arm64/lib/copy_page.S | 33 ++- > >> arch/arm64/lib/mte.S | 27 ++- > >> arch/arm64/mm/Makefile | 1 + > >> arch/arm64/mm/fixmap.c | 38 ++-- > >> 
arch/arm64/mm/hugetlbpage.c | 40 +--- > >> arch/arm64/mm/init.c | 26 +-- > >> arch/arm64/mm/kasan_init.c | 8 +- > >> arch/arm64/mm/mmu.c | 53 +++-- > >> arch/arm64/mm/pgd.c | 12 +- > >> arch/arm64/mm/pgtable-geometry.c | 24 +++ > >> arch/arm64/mm/proc.S | 128 ++++++++--- > >> arch/arm64/mm/ptdump.c | 3 +- > >> arch/arm64/tools/cpucaps | 3 + > >> arch/csky/include/asm/page.h | 3 + > >> arch/hexagon/include/asm/page.h | 2 + > >> arch/loongarch/include/asm/page.h | 2 + > >> arch/m68k/include/asm/page.h | 1 + > >> arch/microblaze/include/asm/page.h | 1 + > >> arch/mips/include/asm/page.h | 1 + > >> arch/nios2/include/asm/page.h | 2 + > >> arch/openrisc/include/asm/page.h | 1 + > >> arch/parisc/include/asm/page.h | 1 + > >> arch/powerpc/include/asm/page.h | 2 + > >> arch/riscv/include/asm/page.h | 1 + > >> arch/s390/include/asm/page.h | 1 + > >> arch/sh/include/asm/page.h | 1 + > >> arch/sparc/include/asm/page.h | 3 + > >> arch/um/include/asm/page.h | 2 + > >> arch/x86/include/asm/page_types.h | 2 + > >> arch/xtensa/include/asm/page.h | 1 + > >> crypto/lskcipher.c | 4 +- > >> drivers/ata/sata_sil24.c | 46 ++-- > >> drivers/base/node.c | 6 +- > >> drivers/base/topology.c | 32 +-- > >> drivers/block/virtio_blk.c | 2 +- > >> drivers/char/random.c | 4 +- > >> drivers/edac/edac_mc.h | 13 +- > >> drivers/firmware/efi/libstub/arm64.c | 3 +- > >> drivers/irqchip/irq-gic-v3-its.c | 2 +- > >> drivers/mtd/mtdswap.c | 4 +- > >> drivers/net/ethernet/freescale/fec.h | 3 +- > >> drivers/net/ethernet/freescale/fec_main.c | 5 +- > >> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > >> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > >> drivers/net/ethernet/intel/igb/igb.h | 25 +-- > >> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > >> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > >> drivers/net/ethernet/marvell/mvneta.c | 9 +- > >> drivers/net/ethernet/marvell/sky2.h | 2 +- > >> drivers/tee/optee/call.c | 7 +- > >> drivers/tee/optee/smc_abi.c | 2 +- > >> 
drivers/virtio/virtio_balloon.c | 10 +- > >> drivers/xen/balloon.c | 11 +- > >> drivers/xen/biomerge.c | 12 +- > >> drivers/xen/privcmd.c | 2 +- > >> drivers/xen/xenbus/xenbus_client.c | 5 +- > >> drivers/xen/xlate_mmu.c | 6 +- > >> fs/binfmt_elf.c | 11 +- > >> fs/buffer.c | 2 +- > >> fs/coredump.c | 8 +- > >> fs/ext4/ext4.h | 36 ++-- > >> fs/ext4/move_extent.c | 2 +- > >> fs/ext4/readpage.c | 2 +- > >> fs/fat/dir.c | 4 +- > >> fs/fat/fatent.c | 4 +- > >> fs/nfs/nfs42proc.c | 2 +- > >> fs/nfs/nfs42xattr.c | 2 +- > >> fs/nfs/nfs4proc.c | 2 +- > >> include/asm-generic/pgtable-geometry.h | 71 +++++++ > >> include/asm-generic/vmlinux.lds.h | 38 ++-- > >> include/linux/buffer_head.h | 1 + > >> include/linux/cpumask.h | 5 + > >> include/linux/linkage.h | 4 +- > >> include/linux/mm.h | 17 +- > >> include/linux/mm_types.h | 15 +- > >> include/linux/mm_types_task.h | 2 +- > >> include/linux/mmzone.h | 3 +- > >> include/linux/netlink.h | 6 +- > >> include/linux/percpu-defs.h | 4 +- > >> include/linux/perf_event.h | 2 +- > >> include/linux/sched.h | 4 +- > >> include/linux/slab.h | 7 +- > >> include/linux/stackdepot.h | 6 +- > >> include/linux/sunrpc/svc.h | 8 +- > >> include/linux/sunrpc/svc_rdma.h | 4 +- > >> include/linux/sunrpc/svcsock.h | 2 +- > >> include/linux/swap.h | 17 +- > >> include/linux/swapops.h | 6 +- > >> include/linux/thread_info.h | 10 +- > >> include/xen/page.h | 2 + > >> init/main.c | 7 +- > >> kernel/bpf/core.c | 9 +- > >> kernel/bpf/ringbuf.c | 54 ++--- > >> kernel/cgroup/cgroup.c | 8 +- > >> kernel/crash_core.c | 2 +- > >> kernel/events/core.c | 2 +- > >> kernel/fork.c | 71 +++---- > >> kernel/power/power.h | 2 +- > >> kernel/power/snapshot.c | 2 +- > >> kernel/power/swap.c | 129 +++++++++-- > >> kernel/trace/fgraph.c | 2 +- > >> kernel/trace/trace.c | 2 +- > >> lib/stackdepot.c | 6 +- > >> mm/kasan/report.c | 3 +- > >> mm/memcontrol.c | 11 +- > >> mm/memory.c | 4 +- > >> mm/mmap.c | 2 +- > >> mm/page-writeback.c | 2 +- > >> mm/page_alloc.c | 31 +-- > 
>> mm/slub.c | 2 +- > >> mm/sparse.c | 2 +- > >> mm/swapfile.c | 2 +- > >> mm/vmalloc.c | 7 +- > >> net/9p/trans_virtio.c | 4 +- > >> net/core/hotdata.c | 4 +- > >> net/core/skbuff.c | 4 +- > >> net/core/sysctl_net_core.c | 2 +- > >> net/sunrpc/cache.c | 3 +- > >> net/unix/af_unix.c | 2 +- > >> sound/soc/soc-utils.c | 4 +- > >> virt/kvm/kvm_main.c | 2 +- > >> 172 files changed, 2185 insertions(+), 951 deletions(-) > >> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > >> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > >> create mode 100644 arch/arm64/mm/pgtable-geometry.c > >> create mode 100644 include/asm-generic/pgtable-geometry.h > >> > >> -- > >> 2.43.0 > > > > This is a generally very exciting patch set! I'm looking forward to seeing it > > land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. > > > > That said, I have a couple of questions: > > > > * Going forward, how would we handle drivers/modules that require a particular > > page size? For example, the Apple Silicon IOMMU driver code requires the > > kernel to operate in 16k page size mode, and it would need to be disabled in > > other page sizes. > > I think these drivers would want to check PAGE_SIZE at probe time and fail if an > unsupported page size is in use. Do you see any issue with that? > > > > > * How would we handle an invalid selection at boot? > > What do you mean by invalid here? The current policy validates that the > requested page size is supported by the HW by checking mmfr0. If no page size is > passed on the command line, or the passed value is not supported by the HW, then > the we default to the largest page size supported by the HW (so for Apple > Silicon that would be 16k since the HW doesn't support 64k). Although I think it > may be better to change that policy to use the smallest page size in this case; > 4k is the safer bet for compat and will waste much less memory than 64k. 
> > > Can we program in a > > fallback when the "wrong" mode is selected for a chip or something similar? > > Do you mean effectively add a mechanism to force 16k if the detected HW is Apple > Silicon? The trouble is that we need to select the page size, very early in > boot, before start_kernel() is called, so we really only have generic arch code > and the command line with which to make the decision. Yes... I think a build-time CONFIG for default page size, which can be overridden by a karg, makes sense... Even on platforms like Apple Silicon you may want to test very specific things in 4k by overriding with a karg. Like in downstream kernels like Fedora/RHEL/etc. I would expect the default would be 4k, but you could override with 16k, 64k, etc. with a karg. > > > > Thanks again and best regards! > > > > (P.S.: Please add the asahi@ mailing list to the CC for future iterations of > > this patch set and tag both Hector and myself in as well. Thanks!) > > Will do! > > > > > > >
On 21/10/2024 12:32, Eric Curtin wrote: > On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 19/10/2024 16:47, Neal Gompa wrote: >>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: >>>> Hi All, >>>> >>>> Patch bomb incoming... This covers many subsystems, so I've included a core >>>> set of people on the full series and additionally included maintainers on >>>> relevant patches. I haven't included those maintainers on this cover letter >>>> since the numbers were far too big for it to work. But I've included a link >>>> to this cover letter on each patch, so they can hopefully find their way >>>> here. For follow up submissions I'll break it up by subsystem, but for now >>>> thought it was important to show the full picture. >>>> >>>> This RFC series implements support for boot-time page size selection within >>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to >>>> date, page size has been selected at compile-time, meaning the size is >>>> baked into a given kernel image. As use of larger-than-4K page sizes becomes >>>> more prevalent this starts to present a problem for distributions. >>>> Boot-time page size selection enables the creation of a single kernel >>>> image, which can be told which page size to use on the kernel command line. >>>> >>>> Why is having an image-per-page size problematic? >>>> ================================================= >>>> >>>> Many traditional distros are now supporting both 4K and 64K. And this means >>>> managing 2 kernel packages, along with drivers for each. For some, it means >>>> multiple installer flavours and multiple ISOs. All of this adds up to a >>>> less-than-ideal level of complexity. Additionally, Android now supports 4K >>>> and 16K kernels. I'm told having to explicitly manage their KABI for each >>>> kernel is painful, and the extra flash space required for both kernel >>>> images and the duplicated modules has been problematic.
Boot-time page size >>>> selection solves all of this. >>>> >>>> Additionally, in starting to think about the longer term deployment story >>>> for D128 page tables, which Arm architecture now supports, a lot of the >>>> same problems need to be solved, so this work sets us up nicely for that. >>>> >>>> So what's the down side? >>>> ======================== >>>> >>>> Well nothing's free; Various static allocations in the kernel image must be >>>> sized for the worst case (largest supported page size), so image size is in >>>> line with size of 64K compile-time image. So if you're interested in 4K or >>>> 16K, there is a slight increase to the image size. But I expect that >>>> problem goes away if you're compressing the image - it's just some extra >>>> zeros. At boot-time, I expect we could free the unused static storage once >>>> we know the page size - although that would be a follow up enhancement. >>>> >>>> And then there is performance. Since PAGE_SIZE and friends are no longer >>>> compile-time constants, we must look up their values and do arithmetic at >>>> runtime instead of compile-time. My early perf testing suggests this is >>>> imperceptible for real-world workloads, and only has small impact on >>>> microbenchmarks - more on this below. >>>> >>>> Approach >>>> ======== >>>> >>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >>>> friends are compile-time constant, but in a way that allows the compiler to >>>> perform the same optimizations as was previously being done if they do turn >>>> out to be compile-time constant. Where constants are required, we use >>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full >>>> description of all the classes of problems to solve. >>>> >>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX.
>>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >>>> Kconfig, which is an alternative to selecting a compile-time page size. >>>> >>>> When boot-time page size is active, the arch pgtable geometry macro >>>> definitions resolve to something that can be configured at boot. The arm64 >>>> implementation in this series mainly uses global, __ro_after_init >>>> variables. I've tried using alternatives patching, but that performs worse >>>> than loading from memory; I think due to code size bloat. >>>> >>>> Status >>>> ====== >>>> >>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented >>>> enough to compile the kernel image itself with defconfig (and a few other >>>> bits and pieces). This is enough to build a kernel that can boot under QEMU >>>> or FVP. I'll happily do the rest of the work to enable all the extra >>>> drivers, but wanted to get feedback on the shape of this effort first. If >>>> anyone wants to do any testing, and has a must-have config, let me know and >>>> I'll prioritize enabling it first. >>>> >>>> The series is arranged as follows: >>>> >>>> - patch 1: Add macros required for converting non-arch code to support >>>> boot-time page size selection >>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from >>>> all non-arch code >>>> - patches 37-38: Some arm64 tidy ups >>>> - patch 39: Add macros required for converting arm64 code to >>> support >>>> boot-time page size selection >>>> - patches 40-56: arm64 changes to support boot-time page size selection >>>> - patch 57: Add arm64 Kconfig option to enable boot-time page >>> size >>>> selection >>>> >>>> Ideally, I'd like to get the basics merged (something like this series), >>>> then incrementally improve it over a handful of kernel releases until we >>>> can demonstrate that we have feature parity with the compile-time build and >>>> no performance blockers. 
Once at that point, ideally the compile-time build >>>> options would be removed and the code could be cleaned up further. >>>> >>>> One of the bigger pieces that I'd propose to add as a follow up, is to make >>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >>>> handling. >>>> >>>> Assuming people are amenable to the rough shape, how would I go about >>>> getting the non-arch changes merged? Since they cover many subsystems, will >>>> each piece need to go independently to each relevant maintainer or could it >>>> all be merged together through the arm64 tree? >>>> >>>> Image Size >>>> ========== >>>> >>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >>>> kernel image on disk for base (before any changes applied), compile (with >>>> changes, configured for compile-time page size) and boot (with changes, >>>> configured for boot-time page size). >>>> >>>> You can see that compile-16k and 64k configs are actually slightly >>>> smaller than the baselines; that's due to optimizing some buffer sizes >>>> which didn't need to depend on page size during the series. The boot-time >>>> image is ~1% bigger than the 64k compile-time image. I believe there is >>>> scope to improve this to make it >>>> equal to compile-64k if required: >>>> | config | size/KB | diff/KB | diff/% | >>>> | >>>> |-------------|---------|---------|---------| >>>> | >>>> | base-4k | 54895 | 0 | 0.0% | >>>> | base-16k | 55161 | 266 | 0.5% | >>>> | base-64k | 56775 | 1880 | 3.4% | >>>> | compile-4k | 54895 | 0 | 0.0% | >>>> | compile-16k | 55097 | 202 | 0.4% | >>>> | compile-64k | 56391 | 1496 | 2.7% | >>>> | boot-4K | 57045 | 2150 | 3.9% | >>>> >>>> And below shows the size of the image in memory at run-time, separated for >>>> text and data costs. The boot image has ~1% text cost; most likely due to >>>> the fact that PAGE_SIZE and friends are not compile-time constants so need >>>> instructions to load the values and do arithmetic.
I believe we could >>>> eventually get the data cost to match the cost for the compile image for >>>> the chosen page size by freeing >>>> the ends of the static buffers not needed for the selected page size: >>>> | | text | text | text | data | data | data | >>>> | >>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >>>> | >>>> |-------------|---------|---------|---------|---------|---------|---------| >>>> | >>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >>>> >>>> Functional Testing >>>> ================== >>>> >>>> I've build-tested defconfig for all arches supported by tuxmake (which is >>>> most) without issue. >>>> >>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page >>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, >>>> with no regressions observed vs the equivalent compile-time page size build >>>> (although the mm-selftests have a few existing failures when run against >>>> 16K and 64K kernels - those should really be investigated and fixed >>>> independently). >>>> >>>> Test coverage is lacking for many of the drivers that I've touched, but in >>>> many cases, I'm hoping the changes are simple enough that review might >>>> suffice? >>>> >>>> Performance Testing >>>> =================== >>>> >>>> I've run some limited performance benchmarks: >>>> >>>> First, a real-world benchmark that causes a lot of page table manipulation >>>> (and therefore we would expect to see regression here if we are going to >>>> see it anywhere); kernel compilation. It barely registers a change. 
Values >>>> are times, >>>> so smaller is better. All relative to base-4k: >>>> | | kern | kern | user | user | real | real | >>>> | >>>> | config | mean | stdev | mean | stdev | mean | stdev | >>>> | >>>> |-------------|---------|---------|---------|---------|---------|---------| >>>> | >>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >>>> >>>> The Speedometer JavaScript benchmark also shows no change. Values are runs >>>> per >>>> min, so bigger is better. All relative to base-4k: >>>> | config | mean | stdev | >>>> | >>>> |-------------|---------|---------| >>>> | >>>> | base-4k | 0.0% | 0.8% | >>>> | compile-4k | 0.4% | 0.8% | >>>> | boot-4k | 0.0% | 0.9% | >>>> >>>> Finally, I've run some microbenchmarks known to stress page table >>>> manipulations (originally from David Hildenbrand). The fork test >>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap >>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing >>>> it. The fork test is known to be extremely sensitive to any changes that >>>> cause instructions to be aligned differently in cachelines. When using this >>>> test for other changes, I've seen double digit regressions for the >>>> slightest thing, so 12% regression on this test is actually fairly good. >>>> This likely represents the extreme worst case for regressions that will be >>>> observed across other microbenchmarks (famous last >>>> words). Values are times, so smaller is better. All relative to base-4k: >>>> | | fork | fork | munmap | munmap | >>>> | >>>> | config | mean | stdev | mean | stdev | >>>> | >>>> |-------------|---------|---------|---------|---------| >>>> | >>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >>>> >>>> NOTE: The series applies on top of v6.11.
>>>> >>>> Thanks, >>>> Ryan >>>> >>>> >>>> Ryan Roberts (57): >>>> mm: Add macros ahead of supporting boot-time page size selection >>>> vmlinux: Align to PAGE_SIZE_MAX >>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible >>>> mm: Avoid split pmd ptl if pmd level is run-time folded >>>> mm: Remove PAGE_SIZE compile-time constant assumption >>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >>>> fs: Remove PAGE_SIZE compile-time constant assumption >>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >>>> fork: Permit boot-time THREAD_SIZE determination >>>> cgroup: Remove PAGE_SIZE compile-time constant assumption >>>> bpf: Remove PAGE_SIZE compile-time constant assumption >>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption >>>> perf: Remove PAGE_SIZE compile-time constant assumption >>>> kvm: Remove PAGE_SIZE compile-time constant assumption >>>> trace: Remove PAGE_SIZE compile-time constant assumption >>>> crash: Remove PAGE_SIZE compile-time constant assumption >>>> crypto: Remove PAGE_SIZE compile-time constant assumption >>>> sunrpc: Remove PAGE_SIZE compile-time constant assumption >>>> sound: Remove PAGE_SIZE compile-time constant assumption >>>> net: Remove PAGE_SIZE compile-time constant assumption >>>> net: fec: Remove PAGE_SIZE compile-time constant assumption >>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption >>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption >>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption >>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >>>> net: igb: Remove PAGE_SIZE compile-time constant assumption >>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption >>>> edac: Remove PAGE_SIZE compile-time 
constant assumption >>>> optee: Remove PAGE_SIZE compile-time constant assumption >>>> random: Remove PAGE_SIZE compile-time constant assumption >>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >>>> virtio: Remove PAGE_SIZE compile-time constant assumption >>>> xen: Remove PAGE_SIZE compile-time constant assumption >>>> arm64: Fix macros to work in C code in addition to the linker script >>>> arm64: Track early pgtable allocation limit >>>> arm64: Introduce macros required for boot-time page selection >>>> arm64: Refactor early pgtable size calculation macros >>>> arm64: Pass desired page size on command line >>>> arm64: Divorce early init from PAGE_SIZE >>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >>>> arm64: Align sections to PAGE_SIZE_MAX >>>> arm64: Rework trampoline rodata mapping >>>> arm64: Generalize fixmap for boot-time page size >>>> arm64: Statically allocate and align for worst-case page size >>>> arm64: Convert switch to if for non-const comparison values >>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >>>> arm64: Remove PAGE_SZ asm-offset >>>> arm64: Introduce cpu features for page sizes >>>> arm64: Remove PAGE_SIZE from assembly code >>>> arm64: Runtime-fold pmd level >>>> arm64: Support runtime folding in idmap_kpti_install_ng_mappings >>>> arm64: TRAMP_VALIAS is no longer compile-time constant >>>> arm64: Determine THREAD_SIZE at boot-time >>>> arm64: Enable boot-time page size selection >>>> >>>> arch/alpha/include/asm/page.h | 1 + >>>> arch/arc/include/asm/page.h | 1 + >>>> arch/arm/include/asm/page.h | 1 + >>>> arch/arm64/Kconfig | 26 ++- >>>> arch/arm64/include/asm/assembler.h | 78 ++++++- >>>> arch/arm64/include/asm/cpufeature.h | 44 +++- >>>> arch/arm64/include/asm/efi.h | 2 +- >>>> arch/arm64/include/asm/fixmap.h | 28 ++- >>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >>>> arch/arm64/include/asm/kvm_arm.h | 21 +- >>>> arch/arm64/include/asm/kvm_hyp.h | 11 + >>>> 
arch/arm64/include/asm/kvm_pgtable.h | 6 +- >>>> arch/arm64/include/asm/memory.h | 62 ++++-- >>>> arch/arm64/include/asm/page-def.h | 3 +- >>>> arch/arm64/include/asm/pgalloc.h | 16 +- >>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- >>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >>>> arch/arm64/include/asm/processor.h | 10 +- >>>> arch/arm64/include/asm/sections.h | 1 + >>>> arch/arm64/include/asm/smp.h | 1 + >>>> arch/arm64/include/asm/sparsemem.h | 15 +- >>>> arch/arm64/include/asm/sysreg.h | 54 +++-- >>>> arch/arm64/include/asm/tlb.h | 3 + >>>> arch/arm64/kernel/asm-offsets.c | 4 +- >>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >>>> arch/arm64/kernel/efi.c | 2 +- >>>> arch/arm64/kernel/entry.S | 60 +++++- >>>> arch/arm64/kernel/head.S | 46 +++- >>>> arch/arm64/kernel/hibernate-asm.S | 6 +- >>>> arch/arm64/kernel/image-vars.h | 14 ++ >>>> arch/arm64/kernel/image.h | 4 + >>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >>>> arch/arm64/kernel/pi/pi.h | 63 +++++- >>>> arch/arm64/kernel/relocate_kernel.S | 10 +- >>>> arch/arm64/kernel/vdso-wrap.S | 4 +- >>>> arch/arm64/kernel/vdso.c | 7 +- >>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- >>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >>>> arch/arm64/kvm/arm.c | 10 + >>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >>>> arch/arm64/kvm/mmu.c | 39 ++-- >>>> arch/arm64/lib/clear_page.S | 7 +- >>>> arch/arm64/lib/copy_page.S | 33 ++- >>>> arch/arm64/lib/mte.S | 27 ++- >>>> arch/arm64/mm/Makefile | 1 + >>>> arch/arm64/mm/fixmap.c | 38 ++-- >>>> 
arch/arm64/mm/hugetlbpage.c | 40 +--- >>>> arch/arm64/mm/init.c | 26 +-- >>>> arch/arm64/mm/kasan_init.c | 8 +- >>>> arch/arm64/mm/mmu.c | 53 +++-- >>>> arch/arm64/mm/pgd.c | 12 +- >>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ >>>> arch/arm64/mm/proc.S | 128 ++++++++--- >>>> arch/arm64/mm/ptdump.c | 3 +- >>>> arch/arm64/tools/cpucaps | 3 + >>>> arch/csky/include/asm/page.h | 3 + >>>> arch/hexagon/include/asm/page.h | 2 + >>>> arch/loongarch/include/asm/page.h | 2 + >>>> arch/m68k/include/asm/page.h | 1 + >>>> arch/microblaze/include/asm/page.h | 1 + >>>> arch/mips/include/asm/page.h | 1 + >>>> arch/nios2/include/asm/page.h | 2 + >>>> arch/openrisc/include/asm/page.h | 1 + >>>> arch/parisc/include/asm/page.h | 1 + >>>> arch/powerpc/include/asm/page.h | 2 + >>>> arch/riscv/include/asm/page.h | 1 + >>>> arch/s390/include/asm/page.h | 1 + >>>> arch/sh/include/asm/page.h | 1 + >>>> arch/sparc/include/asm/page.h | 3 + >>>> arch/um/include/asm/page.h | 2 + >>>> arch/x86/include/asm/page_types.h | 2 + >>>> arch/xtensa/include/asm/page.h | 1 + >>>> crypto/lskcipher.c | 4 +- >>>> drivers/ata/sata_sil24.c | 46 ++-- >>>> drivers/base/node.c | 6 +- >>>> drivers/base/topology.c | 32 +-- >>>> drivers/block/virtio_blk.c | 2 +- >>>> drivers/char/random.c | 4 +- >>>> drivers/edac/edac_mc.h | 13 +- >>>> drivers/firmware/efi/libstub/arm64.c | 3 +- >>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- >>>> drivers/mtd/mtdswap.c | 4 +- >>>> drivers/net/ethernet/freescale/fec.h | 3 +- >>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- >>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- >>>> drivers/net/ethernet/marvell/sky2.h | 2 +- >>>> drivers/tee/optee/call.c | 7 +- >>>> drivers/tee/optee/smc_abi.c | 2 +- >>>> 
drivers/virtio/virtio_balloon.c | 10 +- >>>> drivers/xen/balloon.c | 11 +- >>>> drivers/xen/biomerge.c | 12 +- >>>> drivers/xen/privcmd.c | 2 +- >>>> drivers/xen/xenbus/xenbus_client.c | 5 +- >>>> drivers/xen/xlate_mmu.c | 6 +- >>>> fs/binfmt_elf.c | 11 +- >>>> fs/buffer.c | 2 +- >>>> fs/coredump.c | 8 +- >>>> fs/ext4/ext4.h | 36 ++-- >>>> fs/ext4/move_extent.c | 2 +- >>>> fs/ext4/readpage.c | 2 +- >>>> fs/fat/dir.c | 4 +- >>>> fs/fat/fatent.c | 4 +- >>>> fs/nfs/nfs42proc.c | 2 +- >>>> fs/nfs/nfs42xattr.c | 2 +- >>>> fs/nfs/nfs4proc.c | 2 +- >>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ >>>> include/asm-generic/vmlinux.lds.h | 38 ++-- >>>> include/linux/buffer_head.h | 1 + >>>> include/linux/cpumask.h | 5 + >>>> include/linux/linkage.h | 4 +- >>>> include/linux/mm.h | 17 +- >>>> include/linux/mm_types.h | 15 +- >>>> include/linux/mm_types_task.h | 2 +- >>>> include/linux/mmzone.h | 3 +- >>>> include/linux/netlink.h | 6 +- >>>> include/linux/percpu-defs.h | 4 +- >>>> include/linux/perf_event.h | 2 +- >>>> include/linux/sched.h | 4 +- >>>> include/linux/slab.h | 7 +- >>>> include/linux/stackdepot.h | 6 +- >>>> include/linux/sunrpc/svc.h | 8 +- >>>> include/linux/sunrpc/svc_rdma.h | 4 +- >>>> include/linux/sunrpc/svcsock.h | 2 +- >>>> include/linux/swap.h | 17 +- >>>> include/linux/swapops.h | 6 +- >>>> include/linux/thread_info.h | 10 +- >>>> include/xen/page.h | 2 + >>>> init/main.c | 7 +- >>>> kernel/bpf/core.c | 9 +- >>>> kernel/bpf/ringbuf.c | 54 ++--- >>>> kernel/cgroup/cgroup.c | 8 +- >>>> kernel/crash_core.c | 2 +- >>>> kernel/events/core.c | 2 +- >>>> kernel/fork.c | 71 +++---- >>>> kernel/power/power.h | 2 +- >>>> kernel/power/snapshot.c | 2 +- >>>> kernel/power/swap.c | 129 +++++++++-- >>>> kernel/trace/fgraph.c | 2 +- >>>> kernel/trace/trace.c | 2 +- >>>> lib/stackdepot.c | 6 +- >>>> mm/kasan/report.c | 3 +- >>>> mm/memcontrol.c | 11 +- >>>> mm/memory.c | 4 +- >>>> mm/mmap.c | 2 +- >>>> mm/page-writeback.c | 2 +- >>>> mm/page_alloc.c | 31 +-- 
>>>> mm/slub.c | 2 +- >>>> mm/sparse.c | 2 +- >>>> mm/swapfile.c | 2 +- >>>> mm/vmalloc.c | 7 +- >>>> net/9p/trans_virtio.c | 4 +- >>>> net/core/hotdata.c | 4 +- >>>> net/core/skbuff.c | 4 +- >>>> net/core/sysctl_net_core.c | 2 +- >>>> net/sunrpc/cache.c | 3 +- >>>> net/unix/af_unix.c | 2 +- >>>> sound/soc/soc-utils.c | 4 +- >>>> virt/kvm/kvm_main.c | 2 +- >>>> 172 files changed, 2185 insertions(+), 951 deletions(-) >>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c >>>> create mode 100644 include/asm-generic/pgtable-geometry.h >>>> >>>> -- >>>> 2.43.0 >>> >>> This is a generally very exciting patch set! I'm looking forward to seeing it >>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. >>> >>> That said, I have a couple of questions: >>> >>> * Going forward, how would we handle drivers/modules that require a particular >>> page size? For example, the Apple Silicon IOMMU driver code requires the >>> kernel to operate in 16k page size mode, and it would need to be disabled in >>> other page sizes. >> >> I think these drivers would want to check PAGE_SIZE at probe time and fail if an >> unsupported page size is in use. Do you see any issue with that? >> >>> >>> * How would we handle an invalid selection at boot? >> >> What do you mean by invalid here? The current policy validates that the >> requested page size is supported by the HW by checking mmfr0. If no page size is >> passed on the command line, or the passed value is not supported by the HW, then >> the we default to the largest page size supported by the HW (so for Apple >> Silicon that would be 16k since the HW doesn't support 64k). Although I think it >> may be better to change that policy to use the smallest page size in this case; >> 4k is the safer bet for compat and will waste much less memory than 64k. 
>> >>> Can we program in a >>> fallback when the "wrong" mode is selected for a chip or something similar? >> >> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple >> Silicon? The trouble is that we need to select the page size very early in >> boot, before start_kernel() is called, so we really only have generic arch code >> and the command line with which to make the decision. > > Yes... I think a build-time CONFIG for default page size, which can be > overridden by a karg makes sense... Even on platforms like Apple > Silicon you may want to test very specific things in 4k by overriding > with a karg. Ahh, yes, that would certainly work. I'll work it into the next version. > > Like in downstream kernels like Fedora/RHEL/etc. I would expect the > default would be 4k, but you could override with 16k, 64k, etc. with a > karg. > >> >>>> Thanks again and best regards! >>> >>> (P.S.: Please add the asahi@ mailing list to the CC for future iterations of >>> this patch set and tag both Hector and myself in as well. Thanks!) >> >> Will do! >> >>> >>> >> >> >
On Mon, Oct 21, 2024 at 7:51 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 21/10/2024 12:32, Eric Curtin wrote: > > On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >> On 19/10/2024 16:47, Neal Gompa wrote: > >>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: > >>>> Hi All, > >>>> > >>>> Patch bomb incoming... This covers many subsystems, so I've included a core > >>>> set of people on the full series and additionally included maintainers on > >>>> relevant patches. I haven't included those maintainers on this cover letter > >>>> since the numbers were far too big for it to work. But I've included a link > >>>> to this cover letter on each patch, so they can hopefully find their way > >>>> here. For follow up submissions I'll break it up by subsystem, but for now > >>>> thought it was important to show the full picture. > >>>> > >>>> This RFC series implements support for boot-time page size selection within > >>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to > >>>> date, page size has been selected at compile-time, meaning the size is > >>>> baked into a given kernel image. As use of larger-than-4K page sizes become > >>>> more prevalent this starts to present a problem for distributions. > >>>> Boot-time page size selection enables the creation of a single kernel > >>>> image, which can be told which page size to use on the kernel command line. > >>>> > >>>> Why is having an image-per-page size problematic? > >>>> ================================================= > >>>> > >>>> Many traditional distros are now supporting both 4K and 64K. And this means > >>>> managing 2 kernel packages, along with drivers for each. For some, it means > >>>> multiple installer flavours and multiple ISOs. All of this adds up to a > >>>> less-than-ideal level of complexity. Additionally, Android now supports 4K > >>>> and 16K kernels. 
I'm told having to explicitly manage their KABI for each > >>>> kernel is painful, and the extra flash space required for both kernel > >>>> images and the duplicated modules has been problematic. Boot-time page size > >>>> selection solves all of this. > >>>> > >>>> Additionally, in starting to think about the longer term deployment story > >>>> for D128 page tables, which Arm architecture now supports, a lot of the > >>>> same problems need to be solved, so this work sets us up nicely for that. > >>>> > >>>> So what's the down side? > >>>> ======================== > >>>> > >>>> Well nothing's free; Various static allocations in the kernel image must be > >>>> sized for the worst case (largest supported page size), so image size is in > >>>> line with size of 64K compile-time image. So if you're interested in 4K or > >>>> 16K, there is a slight increase to the image size. But I expect that > >>>> problem goes away if you're compressing the image - it's just some extra > >>>> zeros. At boot-time, I expect we could free the unused static storage once > >>>> we know the page size - although that would be a follow up enhancement. > >>>> > >>>> And then there is performance. Since PAGE_SIZE and friends are no longer > >>>> compile-time constants, we must look up their values and do arithmetic at > >>>> runtime instead of compile-time. My early perf testing suggests this is > >>>> imperceptible for real-world workloads, and only has small impact on > >>>> microbenchmarks - more on this below. > >>>> > >>>> Approach > >>>> ======== > >>>> > >>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and > >>>> friends are compile-time constant, but in a way that allows the compiler to > >>>> perform the same optimizations as was previously being done if they do turn > >>>> out to be compile-time constant. Where constants are required, we use > >>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. 
See commit log in patch 1 for full > >>>> description of all the classes of problems to solve. > >>>> > >>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > >>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. > >>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > >>>> Kconfig, which is an alternative to selecting a compile-time page size. > >>>> > >>>> When boot-time page size is active, the arch pgtable geometry macro > >>>> definitions resolve to something that can be configured at boot. The arm64 > >>>> implementation in this series mainly uses global, __ro_after_init > >>>> variables. I've tried using alternatives patching, but that performs worse > >>>> than loading from memory; I think due to code size bloat. > >>>> > >>>> Status > >>>> ====== > >>>> > >>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented > >>>> enough to compile the kernel image itself with defconfig (and a few other > >>>> bits and pieces). This is enough to build a kernel that can boot under QEMU > >>>> or FVP. I'll happily do the rest of the work to enable all the extra > >>>> drivers, but wanted to get feedback on the shape of this effort first. If > >>>> anyone wants to do any testing, and has a must-have config, let me know and > >>>> I'll prioritize enabling it first. 
> >>>> > >>>> The series is arranged as follows: > >>>> > >>>> - patch 1: Add macros required for converting non-arch code to support > >>>> boot-time page size selection > >>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from > >>>> all non-arch code > >>>> - patches 37-38: Some arm64 tidy ups > >>>> - patch 39: Add macros required for converting arm64 code to > >>> support > >>>> boot-time page size selection > >>>> - patches 40-56: arm64 changes to support boot-time page size selection > >>>> - patch 57: Add arm64 Kconfig option to enable boot-time page > >>> size > >>>> selection > >>>> > >>>> Ideally, I'd like to get the basics merged (something like this series), > >>>> then incrementally improve it over a handful of kernel releases until we > >>>> can demonstrate that we have feature parity with the compile-time build and > >>>> no performance blockers. Once at that point, ideally the compile-time build > >>>> options would be removed and the code could be cleaned up further. > >>>> > >>>> One of the bigger pieces that I'd propose to add as a follow up is to make > >>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback > >>>> handling. > >>>> > >>>> Assuming people are amenable to the rough shape, how would I go about > >>>> getting the non-arch changes merged? Since they cover many subsystems, will > >>>> each piece need to go independently to each relevant maintainer or could it > >>>> all be merged together through the arm64 tree? > >>>> > >>>> Image Size > >>>> ========== > >>>> > >>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > >>>> kernel image on disk for base (before any changes applied), compile (with > >>>> changes, configured for compile-time page size) and boot (with changes, > >>>> configured for boot-time page size). 
> >>>> > >>>> You can see that the compile-16k and 64k configs are actually slightly > >>>> smaller than the baselines; that's due to optimizing some buffer sizes > >>>> which didn't need to depend on page size during the series. The boot-time > >>>> image is ~1% bigger than the 64k compile-time image. I believe there is > >>>> scope to improve this to make it > >>>> equal to compile-64k if required: > >>>> | config | size/KB | diff/KB | diff/% | > >>>> | > >>>> |-------------|---------|---------|---------| > >>>> | > >>>> | base-4k | 54895 | 0 | 0.0% | > >>>> | base-16k | 55161 | 266 | 0.5% | > >>>> | base-64k | 56775 | 1880 | 3.4% | > >>>> | compile-4k | 54895 | 0 | 0.0% | > >>>> | compile-16k | 55097 | 202 | 0.4% | > >>>> | compile-64k | 56391 | 1496 | 2.7% | > >>>> | boot-4K | 57045 | 2150 | 3.9% | > >>>> > >>>> And below shows the size of the image in memory at run-time, separated for > >>>> text and data costs. The boot image has ~1% text cost; most likely due to > >>>> the fact that PAGE_SIZE and friends are not compile-time constants so need > >>>> instructions to load the values and do arithmetic. 
I believe we could > >>>> eventually get the data cost to match the cost for the compile image for > >>>> the chosen page size by freeing > >>>> the ends of the static buffers not needed for the selected page size: > >>>> | | text | text | text | data | data | data | > >>>> | > >>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > >>>> | > >>>> |-------------|---------|---------|---------|---------|---------|---------| > >>>> | > >>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > >>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > >>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > >>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > >>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > >>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > >>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > >>>> > >>>> Functional Testing > >>>> ================== > >>>> > >>>> I've build-tested defconfig for all arches supported by tuxmake (which is > >>>> most) without issue. > >>>> > >>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page > >>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, > >>>> with no regressions observed vs the equivalent compile-time page size build > >>>> (although the mm-selftests have a few existing failures when run against > >>>> 16K and 64K kernels - those should really be investigated and fixed > >>>> independently). > >>>> > >>>> Test coverage is lacking for many of the drivers that I've touched, but in > >>>> many cases, I'm hoping the changes are simple enough that review might > >>>> suffice? 
> >>>> > >>>> Performance Testing > >>>> =================== > >>>> > >>>> I've run some limited performance benchmarks: > >>>> > >>>> First, a real-world benchmark that causes a lot of page table manipulation > >>>> (and therefore we would expect to see regression here if we are going to > >>>> see it anywhere); kernel compilation. It barely registers a change. Values > >>>> are times, > >>>> so smaller is better. All relative to base-4k: > >>>> | | kern | kern | user | user | real | real | > >>>> | > >>>> | config | mean | stdev | mean | stdev | mean | stdev | > >>>> | > >>>> |-------------|---------|---------|---------|---------|---------|---------| > >>>> | > >>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > >>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > >>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > >>>> > >>>> The Speedometer JavaScript benchmark also shows no change. Values are runs > >>>> per > >>>> min, so bigger is better. All relative to base-4k: > >>>> | config | mean | stdev | > >>>> | > >>>> |-------------|---------|---------| > >>>> | > >>>> | base-4k | 0.0% | 0.8% | > >>>> | compile-4k | 0.4% | 0.8% | > >>>> | boot-4k | 0.0% | 0.9% | > >>>> > >>>> Finally, I've run some microbenchmarks known to stress page table > >>>> manipulations (originally from David Hildenbrand). The fork test > >>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap > >>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing > >>>> it. The fork test is known to be extremely sensitive to any changes that > >>>> cause instructions to be aligned differently in cachelines. When using this > >>>> test for other changes, I've seen double digit regressions for the > >>>> slightest thing, so 12% regression on this test is actually fairly good. 
> >>>> This likely represents the extreme worst case for regressions that will be > >>>> observed across other microbenchmarks (famous last > >>>> words). Values are times, so smaller is better. All relative to base-4k: > >>>> | | fork | fork | munmap | munmap | > >>>> | > >>>> | config | mean | stdev | mean | stdev | > >>>> | > >>>> |-------------|---------|---------|---------|---------| > >>>> | > >>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > >>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > >>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > >>>> > >>>> NOTE: The series applies on top of v6.11. > >>>> > >>>> Thanks, > >>>> Ryan > >>>> > >>>> > >>>> Ryan Roberts (57): > >>>> mm: Add macros ahead of supporting boot-time page size selection > >>>> vmlinux: Align to PAGE_SIZE_MAX > >>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > >>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible > >>>> mm: Avoid split pmd ptl if pmd level is run-time folded > >>>> mm: Remove PAGE_SIZE compile-time constant assumption > >>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > >>>> fs: Remove PAGE_SIZE compile-time constant assumption > >>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption > >>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption > >>>> fork: Permit boot-time THREAD_SIZE determination > >>>> cgroup: Remove PAGE_SIZE compile-time constant assumption > >>>> bpf: Remove PAGE_SIZE compile-time constant assumption > >>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > >>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption > >>>> perf: Remove PAGE_SIZE compile-time constant assumption > >>>> kvm: Remove PAGE_SIZE compile-time constant assumption > >>>> trace: Remove PAGE_SIZE compile-time constant assumption > >>>> crash: Remove PAGE_SIZE compile-time constant assumption > >>>> crypto: Remove PAGE_SIZE compile-time constant assumption > >>>> sunrpc: Remove PAGE_SIZE 
compile-time constant assumption > >>>> sound: Remove PAGE_SIZE compile-time constant assumption > >>>> net: Remove PAGE_SIZE compile-time constant assumption > >>>> net: fec: Remove PAGE_SIZE compile-time constant assumption > >>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption > >>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption > >>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption > >>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption > >>>> net: igb: Remove PAGE_SIZE compile-time constant assumption > >>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption > >>>> edac: Remove PAGE_SIZE compile-time constant assumption > >>>> optee: Remove PAGE_SIZE compile-time constant assumption > >>>> random: Remove PAGE_SIZE compile-time constant assumption > >>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption > >>>> virtio: Remove PAGE_SIZE compile-time constant assumption > >>>> xen: Remove PAGE_SIZE compile-time constant assumption > >>>> arm64: Fix macros to work in C code in addition to the linker script > >>>> arm64: Track early pgtable allocation limit > >>>> arm64: Introduce macros required for boot-time page selection > >>>> arm64: Refactor early pgtable size calculation macros > >>>> arm64: Pass desired page size on command line > >>>> arm64: Divorce early init from PAGE_SIZE > >>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > >>>> arm64: Align sections to PAGE_SIZE_MAX > >>>> arm64: Rework trampoline rodata mapping > >>>> arm64: Generalize fixmap for boot-time page size > >>>> arm64: Statically allocate and align for worst-case page size > >>>> arm64: Convert switch to if for non-const comparison values > >>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON > >>>> arm64: Remove PAGE_SZ asm-offset > >>>> arm64: Introduce cpu features for page sizes > >>>> arm64: Remove PAGE_SIZE from assembly code > >>>> arm64: Runtime-fold pmd level > >>>> arm64: Support runtime folding 
in idmap_kpti_install_ng_mappings > >>>> arm64: TRAMP_VALIAS is no longer compile-time constant > >>>> arm64: Determine THREAD_SIZE at boot-time > >>>> arm64: Enable boot-time page size selection > >>>> > >>>> arch/alpha/include/asm/page.h | 1 + > >>>> arch/arc/include/asm/page.h | 1 + > >>>> arch/arm/include/asm/page.h | 1 + > >>>> arch/arm64/Kconfig | 26 ++- > >>>> arch/arm64/include/asm/assembler.h | 78 ++++++- > >>>> arch/arm64/include/asm/cpufeature.h | 44 +++- > >>>> arch/arm64/include/asm/efi.h | 2 +- > >>>> arch/arm64/include/asm/fixmap.h | 28 ++- > >>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > >>>> arch/arm64/include/asm/kvm_arm.h | 21 +- > >>>> arch/arm64/include/asm/kvm_hyp.h | 11 + > >>>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- > >>>> arch/arm64/include/asm/memory.h | 62 ++++-- > >>>> arch/arm64/include/asm/page-def.h | 3 +- > >>>> arch/arm64/include/asm/pgalloc.h | 16 +- > >>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > >>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > >>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- > >>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > >>>> arch/arm64/include/asm/processor.h | 10 +- > >>>> arch/arm64/include/asm/sections.h | 1 + > >>>> arch/arm64/include/asm/smp.h | 1 + > >>>> arch/arm64/include/asm/sparsemem.h | 15 +- > >>>> arch/arm64/include/asm/sysreg.h | 54 +++-- > >>>> arch/arm64/include/asm/tlb.h | 3 + > >>>> arch/arm64/kernel/asm-offsets.c | 4 +- > >>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- > >>>> arch/arm64/kernel/efi.c | 2 +- > >>>> arch/arm64/kernel/entry.S | 60 +++++- > >>>> arch/arm64/kernel/head.S | 46 +++- > >>>> arch/arm64/kernel/hibernate-asm.S | 6 +- > >>>> arch/arm64/kernel/image-vars.h | 14 ++ > >>>> arch/arm64/kernel/image.h | 4 + > >>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > >>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > >>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > >>>> 
arch/arm64/kernel/pi/pi.h | 63 +++++- > >>>> arch/arm64/kernel/relocate_kernel.S | 10 +- > >>>> arch/arm64/kernel/vdso-wrap.S | 4 +- > >>>> arch/arm64/kernel/vdso.c | 7 +- > >>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > >>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- > >>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > >>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > >>>> arch/arm64/kvm/arm.c | 10 + > >>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > >>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > >>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > >>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > >>>> arch/arm64/kvm/mmu.c | 39 ++-- > >>>> arch/arm64/lib/clear_page.S | 7 +- > >>>> arch/arm64/lib/copy_page.S | 33 ++- > >>>> arch/arm64/lib/mte.S | 27 ++- > >>>> arch/arm64/mm/Makefile | 1 + > >>>> arch/arm64/mm/fixmap.c | 38 ++-- > >>>> arch/arm64/mm/hugetlbpage.c | 40 +--- > >>>> arch/arm64/mm/init.c | 26 +-- > >>>> arch/arm64/mm/kasan_init.c | 8 +- > >>>> arch/arm64/mm/mmu.c | 53 +++-- > >>>> arch/arm64/mm/pgd.c | 12 +- > >>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ > >>>> arch/arm64/mm/proc.S | 128 ++++++++--- > >>>> arch/arm64/mm/ptdump.c | 3 +- > >>>> arch/arm64/tools/cpucaps | 3 + > >>>> arch/csky/include/asm/page.h | 3 + > >>>> arch/hexagon/include/asm/page.h | 2 + > >>>> arch/loongarch/include/asm/page.h | 2 + > >>>> arch/m68k/include/asm/page.h | 1 + > >>>> arch/microblaze/include/asm/page.h | 1 + > >>>> arch/mips/include/asm/page.h | 1 + > >>>> arch/nios2/include/asm/page.h | 2 + > >>>> arch/openrisc/include/asm/page.h | 1 + > >>>> arch/parisc/include/asm/page.h | 1 + > >>>> arch/powerpc/include/asm/page.h | 2 + > >>>> arch/riscv/include/asm/page.h | 1 + > >>>> arch/s390/include/asm/page.h | 1 + > >>>> arch/sh/include/asm/page.h | 1 + > >>>> arch/sparc/include/asm/page.h | 3 + > >>>> arch/um/include/asm/page.h | 2 + > >>>> arch/x86/include/asm/page_types.h | 2 + > >>>> arch/xtensa/include/asm/page.h | 1 + > >>>> crypto/lskcipher.c | 4 +- > >>>> 
drivers/ata/sata_sil24.c | 46 ++-- > >>>> drivers/base/node.c | 6 +- > >>>> drivers/base/topology.c | 32 +-- > >>>> drivers/block/virtio_blk.c | 2 +- > >>>> drivers/char/random.c | 4 +- > >>>> drivers/edac/edac_mc.h | 13 +- > >>>> drivers/firmware/efi/libstub/arm64.c | 3 +- > >>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- > >>>> drivers/mtd/mtdswap.c | 4 +- > >>>> drivers/net/ethernet/freescale/fec.h | 3 +- > >>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- > >>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > >>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > >>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- > >>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > >>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > >>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- > >>>> drivers/net/ethernet/marvell/sky2.h | 2 +- > >>>> drivers/tee/optee/call.c | 7 +- > >>>> drivers/tee/optee/smc_abi.c | 2 +- > >>>> drivers/virtio/virtio_balloon.c | 10 +- > >>>> drivers/xen/balloon.c | 11 +- > >>>> drivers/xen/biomerge.c | 12 +- > >>>> drivers/xen/privcmd.c | 2 +- > >>>> drivers/xen/xenbus/xenbus_client.c | 5 +- > >>>> drivers/xen/xlate_mmu.c | 6 +- > >>>> fs/binfmt_elf.c | 11 +- > >>>> fs/buffer.c | 2 +- > >>>> fs/coredump.c | 8 +- > >>>> fs/ext4/ext4.h | 36 ++-- > >>>> fs/ext4/move_extent.c | 2 +- > >>>> fs/ext4/readpage.c | 2 +- > >>>> fs/fat/dir.c | 4 +- > >>>> fs/fat/fatent.c | 4 +- > >>>> fs/nfs/nfs42proc.c | 2 +- > >>>> fs/nfs/nfs42xattr.c | 2 +- > >>>> fs/nfs/nfs4proc.c | 2 +- > >>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ > >>>> include/asm-generic/vmlinux.lds.h | 38 ++-- > >>>> include/linux/buffer_head.h | 1 + > >>>> include/linux/cpumask.h | 5 + > >>>> include/linux/linkage.h | 4 +- > >>>> include/linux/mm.h | 17 +- > >>>> include/linux/mm_types.h | 15 +- > >>>> include/linux/mm_types_task.h | 2 +- > >>>> include/linux/mmzone.h | 3 +- > >>>> include/linux/netlink.h | 6 +- > >>>> include/linux/percpu-defs.h | 4 +- > >>>> 
include/linux/perf_event.h | 2 +- > >>>> include/linux/sched.h | 4 +- > >>>> include/linux/slab.h | 7 +- > >>>> include/linux/stackdepot.h | 6 +- > >>>> include/linux/sunrpc/svc.h | 8 +- > >>>> include/linux/sunrpc/svc_rdma.h | 4 +- > >>>> include/linux/sunrpc/svcsock.h | 2 +- > >>>> include/linux/swap.h | 17 +- > >>>> include/linux/swapops.h | 6 +- > >>>> include/linux/thread_info.h | 10 +- > >>>> include/xen/page.h | 2 + > >>>> init/main.c | 7 +- > >>>> kernel/bpf/core.c | 9 +- > >>>> kernel/bpf/ringbuf.c | 54 ++--- > >>>> kernel/cgroup/cgroup.c | 8 +- > >>>> kernel/crash_core.c | 2 +- > >>>> kernel/events/core.c | 2 +- > >>>> kernel/fork.c | 71 +++---- > >>>> kernel/power/power.h | 2 +- > >>>> kernel/power/snapshot.c | 2 +- > >>>> kernel/power/swap.c | 129 +++++++++-- > >>>> kernel/trace/fgraph.c | 2 +- > >>>> kernel/trace/trace.c | 2 +- > >>>> lib/stackdepot.c | 6 +- > >>>> mm/kasan/report.c | 3 +- > >>>> mm/memcontrol.c | 11 +- > >>>> mm/memory.c | 4 +- > >>>> mm/mmap.c | 2 +- > >>>> mm/page-writeback.c | 2 +- > >>>> mm/page_alloc.c | 31 +-- > >>>> mm/slub.c | 2 +- > >>>> mm/sparse.c | 2 +- > >>>> mm/swapfile.c | 2 +- > >>>> mm/vmalloc.c | 7 +- > >>>> net/9p/trans_virtio.c | 4 +- > >>>> net/core/hotdata.c | 4 +- > >>>> net/core/skbuff.c | 4 +- > >>>> net/core/sysctl_net_core.c | 2 +- > >>>> net/sunrpc/cache.c | 3 +- > >>>> net/unix/af_unix.c | 2 +- > >>>> sound/soc/soc-utils.c | 4 +- > >>>> virt/kvm/kvm_main.c | 2 +- > >>>> 172 files changed, 2185 insertions(+), 951 deletions(-) > >>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > >>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > >>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c > >>>> create mode 100644 include/asm-generic/pgtable-geometry.h > >>>> > >>>> -- > >>>> 2.43.0 > >>> > >>> This is a generally very exciting patch set! I'm looking forward to seeing it > >>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. 
> >>> > >>> That said, I have a couple of questions: > >>> > >>> * Going forward, how would we handle drivers/modules that require a particular > >>> page size? For example, the Apple Silicon IOMMU driver code requires the > >>> kernel to operate in 16k page size mode, and it would need to be disabled in > >>> other page sizes. > >> > >> I think these drivers would want to check PAGE_SIZE at probe time and fail if an > >> unsupported page size is in use. Do you see any issue with that? > >> > >>> > >>> * How would we handle an invalid selection at boot? > >> > >> What do you mean by invalid here? The current policy validates that the > >> requested page size is supported by the HW by checking mmfr0. If no page size is > >> passed on the command line, or the passed value is not supported by the HW, then > >> the we default to the largest page size supported by the HW (so for Apple > >> Silicon that would be 16k since the HW doesn't support 64k). Although I think it > >> may be better to change that policy to use the smallest page size in this case; > >> 4k is the safer bet for compat and will waste much less memory than 64k. > >> > >>> Can we program in a > >>> fallback when the "wrong" mode is selected for a chip or something similar? > >> > >> Do you mean effectively add a machanism to force 16k if the detected HW is Apple > >> Silicon? The trouble is that we need to select the page size, very early in > >> boot, before start_kernel() is called, so we really only have generic arch code > >> and the command line with which to make the decision. > > > > Yes... I think a build-time CONFIG for default page size, which can be > > overridden by a karg makes sense... Even on platforms like Apple > > Silicon you may want to test very specific things in 4k by overriding > > with a karg. > > Ahh, yes, that would certainly work. I'll work it into the next version. 
> Could we maybe extend this to include a table of SoC IDs for which certain modes are disabled (e.g. 64k on Apple Silicon), along with the preferred mode to use when no arg is set (16k for Apple Silicon)? That way it'd work something like this: 1. Table identification of 4/16/64 depending on identified SoC 2. Unidentified ones follow build-time default 3. karg forces a mode regardless -- 真実はいつも一つ!/ Always, there's only one truth!
On 21/10/2024 14:49, Neal Gompa wrote: > On Mon, Oct 21, 2024 at 7:51 AM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 21/10/2024 12:32, Eric Curtin wrote: >>> On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>> On 19/10/2024 16:47, Neal Gompa wrote: >>>>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: >>>>>> Hi All, >>>>>> >>>>>> Patch bomb incoming... This covers many subsystems, so I've included a core >>>>>> set of people on the full series and additionally included maintainers on >>>>>> relevant patches. I haven't included those maintainers on this cover letter >>>>>> since the numbers were far too big for it to work. But I've included a link >>>>>> to this cover letter on each patch, so they can hopefully find their way >>>>>> here. For follow up submissions I'll break it up by subsystem, but for now >>>>>> thought it was important to show the full picture. >>>>>> >>>>>> This RFC series implements support for boot-time page size selection within >>>>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to >>>>>> date, page size has been selected at compile-time, meaning the size is >>>>>> baked into a given kernel image. As use of larger-than-4K page sizes become >>>>>> more prevalent this starts to present a problem for distributions. >>>>>> Boot-time page size selection enables the creation of a single kernel >>>>>> image, which can be told which page size to use on the kernel command line. >>>>>> >>>>>> Why is having an image-per-page size problematic? >>>>>> ================================================= >>>>>> >>>>>> Many traditional distros are now supporting both 4K and 64K. And this means >>>>>> managing 2 kernel packages, along with drivers for each. For some, it means >>>>>> multiple installer flavours and multiple ISOs. All of this adds up to a >>>>>> less-than-ideal level of complexity. Additionally, Android now supports 4K >>>>>> and 16K kernels. 
I'm told having to explicitly manage their KABI for each >>>>>> kernel is painful, and the extra flash space required for both kernel >>>>>> images and the duplicated modules has been problematic. Boot-time page size >>>>>> selection solves all of this. >>>>>> >>>>>> Additionally, in starting to think about the longer term deployment story >>>>>> for D128 page tables, which Arm architecture now supports, a lot of the >>>>>> same problems need to be solved, so this work sets us up nicely for that. >>>>>> >>>>>> So what's the down side? >>>>>> ======================== >>>>>> >>>>>> Well nothing's free; Various static allocations in the kernel image must be >>>>>> sized for the worst case (largest supported page size), so image size is in >>>>>> line with size of 64K compile-time image. So if you're interested in 4K or >>>>>> 16K, there is a slight increase to the image size. But I expect that >>>>>> problem goes away if you're compressing the image - it's just some extra >>>>>> zeros. At boot-time, I expect we could free the unused static storage once >>>>>> we know the page size - although that would be a follow up enhancement. >>>>>> >>>>>> And then there is performance. Since PAGE_SIZE and friends are no longer >>>>>> compile-time constants, we must look up their values and do arithmetic at >>>>>> runtime instead of compile-time. My early perf testing suggests this is >>>>>> imperceptible for real-world workloads, and only has small impact on >>>>>> microbenchmarks - more on this below. >>>>>> >>>>>> Approach >>>>>> ======== >>>>>> >>>>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >>>>>> friends are compile-time constant, but in a way that allows the compiler to >>>>>> perform the same optimizations as was previously being done if they do turn >>>>>> out to be compile-time constant. Where constants are required, we use >>>>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX.
See commit log in patch 1 for full >>>>>> description of all the classes of problems to solve. >>>>>> >>>>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >>>>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. >>>>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >>>>>> Kconfig, which is an alternative to selecting a compile-time page size. >>>>>> >>>>>> When boot-time page size is active, the arch pgtable geometry macro >>>>>> definitions resolve to something that can be configured at boot. The arm64 >>>>>> implementation in this series mainly uses global, __ro_after_init >>>>>> variables. I've tried using alternatives patching, but that performs worse >>>>>> than loading from memory; I think due to code size bloat. >>>>>> >>>>>> Status >>>>>> ====== >>>>>> >>>>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented >>>>>> enough to compile the kernel image itself with defconfig (and a few other >>>>>> bits and pieces). This is enough to build a kernel that can boot under QEMU >>>>>> or FVP. I'll happily do the rest of the work to enable all the extra >>>>>> drivers, but wanted to get feedback on the shape of this effort first. If >>>>>> anyone wants to do any testing, and has a must-have config, let me know and >>>>>> I'll prioritize enabling it first. 
>>>>>> >>>>>> The series is arranged as follows: >>>>>> >>>>>> - patch 1: Add macros required for converting non-arch code to support >>>>>> boot-time page size selection >>>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from >>>>>> all non-arch code >>>>>> - patches 37-38: Some arm64 tidy ups >>>>>> - patch 39: Add macros required for converting arm64 code to >>>>> support >>>>>> boot-time page size selection >>>>>> - patches 40-56: arm64 changes to support boot-time page size selection >>>>>> - patch 57: Add arm64 Kconfig option to enable boot-time page >>>>> size >>>>>> selection >>>>>> >>>>>> Ideally, I'd like to get the basics merged (something like this series), >>>>>> then incrementally improve it over a handful of kernel releases until we >>>>>> can demonstrate that we have feature parity with the compile-time build and >>>>>> no performance blockers. Once at that point, ideally the compile-time build >>>>>> options would be removed and the code could be cleaned up further. >>>>>> >>>>>> One of the bigger pieces that I'd propose to add as a follow up, is to make >>>>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >>>>>> handling. >>>>>> >>>>>> Assuming people are amenable to the rough shape, how would I go about >>>>>> getting the non-arch changes merged? Since they cover many subsystems, will >>>>>> each piece need to go independently to each relevant maintainer or could it >>>>>> all be merged together through the arm64 tree? >>>>>> >>>>>> Image Size >>>>>> ========== >>>>>> >>>>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >>>>>> kernel image on disk for base (before any changes applied), compile (with >>>>>> changes, configured for compile-time page size) and boot (with changes, >>>>>> configured for boot-time page size).
>>>>>> >>>>>> You can see that the compile-16k and 64k configs are actually slightly >>>>>> smaller than the baselines; that's due to optimizing some buffer sizes >>>>>> which didn't need to depend on page size during the series. The boot-time >>>>>> image is ~1% bigger than the 64k compile-time image. I believe there is >>>>>> scope to improve this to make it >>>>>> equal to compile-64k if required: >>>>>> | config | size/KB | diff/KB | diff/% | >>>>>> | >>>>>> |-------------|---------|---------|---------| >>>>>> | >>>>>> | base-4k | 54895 | 0 | 0.0% | >>>>>> | base-16k | 55161 | 266 | 0.5% | >>>>>> | base-64k | 56775 | 1880 | 3.4% | >>>>>> | compile-4k | 54895 | 0 | 0.0% | >>>>>> | compile-16k | 55097 | 202 | 0.4% | >>>>>> | compile-64k | 56391 | 1496 | 2.7% | >>>>>> | boot-4K | 57045 | 2150 | 3.9% | >>>>>> >>>>>> And below shows the size of the image in memory at run-time, separated for >>>>>> text and data costs. The boot image has ~1% text cost; most likely due to >>>>>> the fact that PAGE_SIZE and friends are not compile-time constants so need >>>>>> instructions to load the values and do arithmetic.
I believe we could >>>>>> eventually get the data cost to match the cost for the compile image for >>>>>> the chosen page size by freeing >>>>>> the ends of the static buffers not needed for the selected page size: >>>>>> | | text | text | text | data | data | data | >>>>>> | >>>>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >>>>>> | >>>>>> |-------------|---------|---------|---------|---------|---------|---------| >>>>>> | >>>>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >>>>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >>>>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >>>>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >>>>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >>>>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >>>>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >>>>>> >>>>>> Functional Testing >>>>>> ================== >>>>>> >>>>>> I've build-tested defconfig for all arches supported by tuxmake (which is >>>>>> most) without issue. >>>>>> >>>>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page >>>>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, >>>>>> with no regressions observed vs the equivalent compile-time page size build >>>>>> (although the mm-selftests have a few existing failures when run against >>>>>> 16K and 64K kernels - those should really be investigated and fixed >>>>>> independently). >>>>>> >>>>>> Test coverage is lacking for many of the drivers that I've touched, but in >>>>>> many cases, I'm hoping the changes are simple enough that review might >>>>>> suffice? 
>>>>>> >>>>>> Performance Testing >>>>>> =================== >>>>>> >>>>>> I've run some limited performance benchmarks: >>>>>> >>>>>> First, a real-world benchmark that causes a lot of page table manipulation >>>>>> (and therefore we would expect to see regression here if we are going to >>>>>> see it anywhere); kernel compilation. It barely registers a change. Values >>>>>> are times, >>>>>> so smaller is better. All relative to base-4k: >>>>>> | | kern | kern | user | user | real | real | >>>>>> | >>>>>> | config | mean | stdev | mean | stdev | mean | stdev | >>>>>> | >>>>>> |-------------|---------|---------|---------|---------|---------|---------| >>>>>> | >>>>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >>>>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >>>>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >>>>>> >>>>>> The Speedometer JavaScript benchmark also shows no change. Values are runs >>>>>> per >>>>>> min, so bigger is better. All relative to base-4k: >>>>>> | config | mean | stdev | >>>>>> | >>>>>> |-------------|---------|---------| >>>>>> | >>>>>> | base-4k | 0.0% | 0.8% | >>>>>> | compile-4k | 0.4% | 0.8% | >>>>>> | boot-4k | 0.0% | 0.9% | >>>>>> >>>>>> Finally, I've run some microbenchmarks known to stress page table >>>>>> manipulations (originally from David Hildenbrand). The fork test >>>>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap >>>>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing >>>>>> it. The fork test is known to be extremely sensitive to any changes that >>>>>> cause instructions to be aligned differently in cachelines. When using this >>>>>> test for other changes, I've seen double digit regressions for the >>>>>> slightest thing, so 12% regression on this test is actually fairly good. 
>>>>>> This likely represents the extreme worst case for regressions that will be >>>>>> observed across other microbenchmarks (famous last >>>>>> words). Values are times, so smaller is better. All relative to base-4k: >>>>>> | | fork | fork | munmap | munmap | >>>>>> | >>>>>> | config | mean | stdev | stdev | stdev | >>>>>> | >>>>>> |-------------|---------|---------|---------|---------| >>>>>> | >>>>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >>>>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >>>>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >>>>>> >>>>>> NOTE: The series applies on top of v6.11. >>>>>> >>>>>> Thanks, >>>>>> Ryan >>>>>> >>>>>> >>>>>> Ryan Roberts (57): >>>>>> mm: Add macros ahead of supporting boot-time page size selection >>>>>> vmlinux: Align to PAGE_SIZE_MAX >>>>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >>>>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible >>>>>> mm: Avoid split pmd ptl if pmd level is run-time folded >>>>>> mm: Remove PAGE_SIZE compile-time constant assumption >>>>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >>>>>> fs: Remove PAGE_SIZE compile-time constant assumption >>>>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >>>>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >>>>>> fork: Permit boot-time THREAD_SIZE determination >>>>>> cgroup: Remove PAGE_SIZE compile-time constant assumption >>>>>> bpf: Remove PAGE_SIZE compile-time constant assumption >>>>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >>>>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption >>>>>> perf: Remove PAGE_SIZE compile-time constant assumption >>>>>> kvm: Remove PAGE_SIZE compile-time constant assumption >>>>>> trace: Remove PAGE_SIZE compile-time constant assumption >>>>>> crash: Remove PAGE_SIZE compile-time constant assumption >>>>>> crypto: Remove PAGE_SIZE compile-time constant assumption >>>>>> sunrpc: Remove PAGE_SIZE 
compile-time constant assumption >>>>>> sound: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: fec: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >>>>>> net: igb: Remove PAGE_SIZE compile-time constant assumption >>>>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption >>>>>> edac: Remove PAGE_SIZE compile-time constant assumption >>>>>> optee: Remove PAGE_SIZE compile-time constant assumption >>>>>> random: Remove PAGE_SIZE compile-time constant assumption >>>>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >>>>>> virtio: Remove PAGE_SIZE compile-time constant assumption >>>>>> xen: Remove PAGE_SIZE compile-time constant assumption >>>>>> arm64: Fix macros to work in C code in addition to the linker script >>>>>> arm64: Track early pgtable allocation limit >>>>>> arm64: Introduce macros required for boot-time page selection >>>>>> arm64: Refactor early pgtable size calculation macros >>>>>> arm64: Pass desired page size on command line >>>>>> arm64: Divorce early init from PAGE_SIZE >>>>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >>>>>> arm64: Align sections to PAGE_SIZE_MAX >>>>>> arm64: Rework trampoline rodata mapping >>>>>> arm64: Generalize fixmap for boot-time page size >>>>>> arm64: Statically allocate and align for worst-case page size >>>>>> arm64: Convert switch to if for non-const comparison values >>>>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >>>>>> arm64: Remove PAGE_SZ asm-offset >>>>>> arm64: Introduce cpu features for page sizes >>>>>> arm64: Remove PAGE_SIZE from assembly code >>>>>> arm64: Runtime-fold pmd level >>>>>> arm64: Support runtime folding 
in idmap_kpti_install_ng_mappings >>>>>> arm64: TRAMP_VALIAS is no longer compile-time constant >>>>>> arm64: Determine THREAD_SIZE at boot-time >>>>>> arm64: Enable boot-time page size selection >>>>>> >>>>>> arch/alpha/include/asm/page.h | 1 + >>>>>> arch/arc/include/asm/page.h | 1 + >>>>>> arch/arm/include/asm/page.h | 1 + >>>>>> arch/arm64/Kconfig | 26 ++- >>>>>> arch/arm64/include/asm/assembler.h | 78 ++++++- >>>>>> arch/arm64/include/asm/cpufeature.h | 44 +++- >>>>>> arch/arm64/include/asm/efi.h | 2 +- >>>>>> arch/arm64/include/asm/fixmap.h | 28 ++- >>>>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >>>>>> arch/arm64/include/asm/kvm_arm.h | 21 +- >>>>>> arch/arm64/include/asm/kvm_hyp.h | 11 + >>>>>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >>>>>> arch/arm64/include/asm/memory.h | 62 ++++-- >>>>>> arch/arm64/include/asm/page-def.h | 3 +- >>>>>> arch/arm64/include/asm/pgalloc.h | 16 +- >>>>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >>>>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >>>>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- >>>>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >>>>>> arch/arm64/include/asm/processor.h | 10 +- >>>>>> arch/arm64/include/asm/sections.h | 1 + >>>>>> arch/arm64/include/asm/smp.h | 1 + >>>>>> arch/arm64/include/asm/sparsemem.h | 15 +- >>>>>> arch/arm64/include/asm/sysreg.h | 54 +++-- >>>>>> arch/arm64/include/asm/tlb.h | 3 + >>>>>> arch/arm64/kernel/asm-offsets.c | 4 +- >>>>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >>>>>> arch/arm64/kernel/efi.c | 2 +- >>>>>> arch/arm64/kernel/entry.S | 60 +++++- >>>>>> arch/arm64/kernel/head.S | 46 +++- >>>>>> arch/arm64/kernel/hibernate-asm.S | 6 +- >>>>>> arch/arm64/kernel/image-vars.h | 14 ++ >>>>>> arch/arm64/kernel/image.h | 4 + >>>>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >>>>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >>>>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >>>>>> 
arch/arm64/kernel/pi/pi.h | 63 +++++- >>>>>> arch/arm64/kernel/relocate_kernel.S | 10 +- >>>>>> arch/arm64/kernel/vdso-wrap.S | 4 +- >>>>>> arch/arm64/kernel/vdso.c | 7 +- >>>>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >>>>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- >>>>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >>>>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >>>>>> arch/arm64/kvm/arm.c | 10 + >>>>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >>>>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >>>>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >>>>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >>>>>> arch/arm64/kvm/mmu.c | 39 ++-- >>>>>> arch/arm64/lib/clear_page.S | 7 +- >>>>>> arch/arm64/lib/copy_page.S | 33 ++- >>>>>> arch/arm64/lib/mte.S | 27 ++- >>>>>> arch/arm64/mm/Makefile | 1 + >>>>>> arch/arm64/mm/fixmap.c | 38 ++-- >>>>>> arch/arm64/mm/hugetlbpage.c | 40 +--- >>>>>> arch/arm64/mm/init.c | 26 +-- >>>>>> arch/arm64/mm/kasan_init.c | 8 +- >>>>>> arch/arm64/mm/mmu.c | 53 +++-- >>>>>> arch/arm64/mm/pgd.c | 12 +- >>>>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ >>>>>> arch/arm64/mm/proc.S | 128 ++++++++--- >>>>>> arch/arm64/mm/ptdump.c | 3 +- >>>>>> arch/arm64/tools/cpucaps | 3 + >>>>>> arch/csky/include/asm/page.h | 3 + >>>>>> arch/hexagon/include/asm/page.h | 2 + >>>>>> arch/loongarch/include/asm/page.h | 2 + >>>>>> arch/m68k/include/asm/page.h | 1 + >>>>>> arch/microblaze/include/asm/page.h | 1 + >>>>>> arch/mips/include/asm/page.h | 1 + >>>>>> arch/nios2/include/asm/page.h | 2 + >>>>>> arch/openrisc/include/asm/page.h | 1 + >>>>>> arch/parisc/include/asm/page.h | 1 + >>>>>> arch/powerpc/include/asm/page.h | 2 + >>>>>> arch/riscv/include/asm/page.h | 1 + >>>>>> arch/s390/include/asm/page.h | 1 + >>>>>> arch/sh/include/asm/page.h | 1 + >>>>>> arch/sparc/include/asm/page.h | 3 + >>>>>> arch/um/include/asm/page.h | 2 + >>>>>> arch/x86/include/asm/page_types.h | 2 + >>>>>> arch/xtensa/include/asm/page.h | 1 + >>>>>> crypto/lskcipher.c | 4 +- >>>>>> 
drivers/ata/sata_sil24.c | 46 ++-- >>>>>> drivers/base/node.c | 6 +- >>>>>> drivers/base/topology.c | 32 +-- >>>>>> drivers/block/virtio_blk.c | 2 +- >>>>>> drivers/char/random.c | 4 +- >>>>>> drivers/edac/edac_mc.h | 13 +- >>>>>> drivers/firmware/efi/libstub/arm64.c | 3 +- >>>>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- >>>>>> drivers/mtd/mtdswap.c | 4 +- >>>>>> drivers/net/ethernet/freescale/fec.h | 3 +- >>>>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- >>>>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >>>>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >>>>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >>>>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >>>>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >>>>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- >>>>>> drivers/net/ethernet/marvell/sky2.h | 2 +- >>>>>> drivers/tee/optee/call.c | 7 +- >>>>>> drivers/tee/optee/smc_abi.c | 2 +- >>>>>> drivers/virtio/virtio_balloon.c | 10 +- >>>>>> drivers/xen/balloon.c | 11 +- >>>>>> drivers/xen/biomerge.c | 12 +- >>>>>> drivers/xen/privcmd.c | 2 +- >>>>>> drivers/xen/xenbus/xenbus_client.c | 5 +- >>>>>> drivers/xen/xlate_mmu.c | 6 +- >>>>>> fs/binfmt_elf.c | 11 +- >>>>>> fs/buffer.c | 2 +- >>>>>> fs/coredump.c | 8 +- >>>>>> fs/ext4/ext4.h | 36 ++-- >>>>>> fs/ext4/move_extent.c | 2 +- >>>>>> fs/ext4/readpage.c | 2 +- >>>>>> fs/fat/dir.c | 4 +- >>>>>> fs/fat/fatent.c | 4 +- >>>>>> fs/nfs/nfs42proc.c | 2 +- >>>>>> fs/nfs/nfs42xattr.c | 2 +- >>>>>> fs/nfs/nfs4proc.c | 2 +- >>>>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ >>>>>> include/asm-generic/vmlinux.lds.h | 38 ++-- >>>>>> include/linux/buffer_head.h | 1 + >>>>>> include/linux/cpumask.h | 5 + >>>>>> include/linux/linkage.h | 4 +- >>>>>> include/linux/mm.h | 17 +- >>>>>> include/linux/mm_types.h | 15 +- >>>>>> include/linux/mm_types_task.h | 2 +- >>>>>> include/linux/mmzone.h | 3 +- >>>>>> include/linux/netlink.h | 6 +- >>>>>> include/linux/percpu-defs.h | 4 +- >>>>>> 
include/linux/perf_event.h | 2 +- >>>>>> include/linux/sched.h | 4 +- >>>>>> include/linux/slab.h | 7 +- >>>>>> include/linux/stackdepot.h | 6 +- >>>>>> include/linux/sunrpc/svc.h | 8 +- >>>>>> include/linux/sunrpc/svc_rdma.h | 4 +- >>>>>> include/linux/sunrpc/svcsock.h | 2 +- >>>>>> include/linux/swap.h | 17 +- >>>>>> include/linux/swapops.h | 6 +- >>>>>> include/linux/thread_info.h | 10 +- >>>>>> include/xen/page.h | 2 + >>>>>> init/main.c | 7 +- >>>>>> kernel/bpf/core.c | 9 +- >>>>>> kernel/bpf/ringbuf.c | 54 ++--- >>>>>> kernel/cgroup/cgroup.c | 8 +- >>>>>> kernel/crash_core.c | 2 +- >>>>>> kernel/events/core.c | 2 +- >>>>>> kernel/fork.c | 71 +++---- >>>>>> kernel/power/power.h | 2 +- >>>>>> kernel/power/snapshot.c | 2 +- >>>>>> kernel/power/swap.c | 129 +++++++++-- >>>>>> kernel/trace/fgraph.c | 2 +- >>>>>> kernel/trace/trace.c | 2 +- >>>>>> lib/stackdepot.c | 6 +- >>>>>> mm/kasan/report.c | 3 +- >>>>>> mm/memcontrol.c | 11 +- >>>>>> mm/memory.c | 4 +- >>>>>> mm/mmap.c | 2 +- >>>>>> mm/page-writeback.c | 2 +- >>>>>> mm/page_alloc.c | 31 +-- >>>>>> mm/slub.c | 2 +- >>>>>> mm/sparse.c | 2 +- >>>>>> mm/swapfile.c | 2 +- >>>>>> mm/vmalloc.c | 7 +- >>>>>> net/9p/trans_virtio.c | 4 +- >>>>>> net/core/hotdata.c | 4 +- >>>>>> net/core/skbuff.c | 4 +- >>>>>> net/core/sysctl_net_core.c | 2 +- >>>>>> net/sunrpc/cache.c | 3 +- >>>>>> net/unix/af_unix.c | 2 +- >>>>>> sound/soc/soc-utils.c | 4 +- >>>>>> virt/kvm/kvm_main.c | 2 +- >>>>>> 172 files changed, 2185 insertions(+), 951 deletions(-) >>>>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >>>>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >>>>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c >>>>>> create mode 100644 include/asm-generic/pgtable-geometry.h >>>>>> >>>>>> -- >>>>>> 2.43.0 >>>>> >>>>> This is a generally very exciting patch set! I'm looking forward to seeing it >>>>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. 
>>>>> >>>>> That said, I have a couple of questions: >>>>> >>>>> * Going forward, how would we handle drivers/modules that require a particular >>>>> page size? For example, the Apple Silicon IOMMU driver code requires the >>>>> kernel to operate in 16k page size mode, and it would need to be disabled in >>>>> other page sizes. >>>> >>>> I think these drivers would want to check PAGE_SIZE at probe time and fail if an >>>> unsupported page size is in use. Do you see any issue with that? >>>> >>>>> >>>>> * How would we handle an invalid selection at boot? >>>> >>>> What do you mean by invalid here? The current policy validates that the >>>> requested page size is supported by the HW by checking mmfr0. If no page size is >>>> passed on the command line, or the passed value is not supported by the HW, then >>>> we default to the largest page size supported by the HW (so for Apple >>>> Silicon that would be 16k since the HW doesn't support 64k). Although I think it >>>> may be better to change that policy to use the smallest page size in this case; >>>> 4k is the safer bet for compat and will waste much less memory than 64k. >>>> >>>>> Can we program in a >>>>> fallback when the "wrong" mode is selected for a chip or something similar? >>>> >>>> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple >>>> Silicon? The trouble is that we need to select the page size, very early in >>>> boot, before start_kernel() is called, so we really only have generic arch code >>>> and the command line with which to make the decision. >>> >>> Yes... I think a build-time CONFIG for default page size, which can be >>> overridden by a karg makes sense... Even on platforms like Apple >>> Silicon you may want to test very specific things in 4k by overriding >>> with a karg. >> >> Ahh, yes, that would certainly work. I'll work it into the next version.
>> > > Could we maybe extend to have some kind of way to include a table of > SoC IDs that certain modes are disabled (e.g. 64k on Apple Silicon) 64k is already disabled on Apple Silicon because mmfr0 reports that 64k is not supported. > and preferred modes when no arg is set (16k for Apple Silicon)? That And it's not obvious that we should hard-code a page size preference to a SoC ID. If the CPU can support multiple page sizes, it should be up to the SW stack to decide, not the SoC. I'm guessing your desire is to have a single kernel build that will boot 16k by default on Apple Silicon and 4k by default on other systems, all without needing to modify the command line? Personally I think it's cleaner to just require setting the page size on the command line in these cases. > way it'd work something like this: > > 1. Table identification of 4/16/64 depending on identified SoC So I'd prefer not to have this > 2. Unidentified ones follow build-time default > 3. karg forces a mode regardless But keep these 2. > >
On Mon, Oct 21, 2024 at 11:02 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 21/10/2024 14:49, Neal Gompa wrote: > > On Mon, Oct 21, 2024 at 7:51 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >> On 21/10/2024 12:32, Eric Curtin wrote: > >>> On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>> > >>>> On 19/10/2024 16:47, Neal Gompa wrote: > >>>>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: > >>>>>> Hi All, > >>>>>> > >>>>>> Patch bomb incoming... This covers many subsystems, so I've included a core > >>>>>> set of people on the full series and additionally included maintainers on > >>>>>> relevant patches. I haven't included those maintainers on this cover letter > >>>>>> since the numbers were far too big for it to work. But I've included a link > >>>>>> to this cover letter on each patch, so they can hopefully find their way > >>>>>> here. For follow up submissions I'll break it up by subsystem, but for now > >>>>>> thought it was important to show the full picture. > >>>>>> > >>>>>> This RFC series implements support for boot-time page size selection within > >>>>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to > >>>>>> date, page size has been selected at compile-time, meaning the size is > >>>>>> baked into a given kernel image. As use of larger-than-4K page sizes become > >>>>>> more prevalent this starts to present a problem for distributions. > >>>>>> Boot-time page size selection enables the creation of a single kernel > >>>>>> image, which can be told which page size to use on the kernel command line. > >>>>>> > >>>>>> Why is having an image-per-page size problematic? > >>>>>> ================================================= > >>>>>> > >>>>>> Many traditional distros are now supporting both 4K and 64K. And this means > >>>>>> managing 2 kernel packages, along with drivers for each. For some, it means > >>>>>> multiple installer flavours and multiple ISOs. 
All of this adds up to a > >>>>>> less-than-ideal level of complexity. Additionally, Android now supports 4K > >>>>>> and 16K kernels. I'm told having to explicitly manage their KABI for each > >>>>>> kernel is painful, and the extra flash space required for both kernel > >>>>>> images and the duplicated modules has been problematic. Boot-time page size > >>>>>> selection solves all of this. > >>>>>> > >>>>>> Additionally, in starting to think about the longer term deployment story > >>>>>> for D128 page tables, which Arm architecture now supports, a lot of the > >>>>>> same problems need to be solved, so this work sets us up nicely for that. > >>>>>> > >>>>>> So what's the down side? > >>>>>> ======================== > >>>>>> > >>>>>> Well nothing's free; Various static allocations in the kernel image must be > >>>>>> sized for the worst case (largest supported page size), so image size is in > >>>>>> line with size of 64K compile-time image. So if you're interested in 4K or > >>>>>> 16K, there is a slight increase to the image size. But I expect that > >>>>>> problem goes away if you're compressing the image - its just some extra > >>>>>> zeros. At boot-time, I expect we could free the unused static storage once > >>>>>> we know the page size - although that would be a follow up enhancement. > >>>>>> > >>>>>> And then there is performance. Since PAGE_SIZE and friends are no longer > >>>>>> compile-time constants, we must look up their values and do arithmetic at > >>>>>> runtime instead of compile-time. My early perf testing suggests this is > >>>>>> inperceptible for real-world workloads, and only has small impact on > >>>>>> microbenchmarks - more on this below. 
> >>>>>> > >>>>>> Approach > >>>>>> ======== > >>>>>> > >>>>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and > >>>>>> friends are compile-time constant, but in a way that allows the compiler to > >>>>>> perform the same optimizations as was previously being done if they do turn > >>>>>> out to be compile-time constant. Where constants are required, we use > >>>>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full > >>>>>> description of all the classes of problems to solve. > >>>>>> > >>>>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > >>>>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. > >>>>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > >>>>>> Kconfig, which is an alternative to selecting a compile-time page size. > >>>>>> > >>>>>> When boot-time page size is active, the arch pgtable geometry macro > >>>>>> definitions resolve to something that can be configured at boot. The arm64 > >>>>>> implementation in this series mainly uses global, __ro_after_init > >>>>>> variables. I've tried using alternatives patching, but that performs worse > >>>>>> than loading from memory; I think due to code size bloat. > >>>>>> > >>>>>> Status > >>>>>> ====== > >>>>>> > >>>>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented > >>>>>> enough to compile the kernel image itself with defconfig (and a few other > >>>>>> bits and pieces). This is enough to build a kernel that can boot under QEMU > >>>>>> or FVP. I'll happily do the rest of the work to enable all the extra > >>>>>> drivers, but wanted to get feedback on the shape of this effort first. If > >>>>>> anyone wants to do any testing, and has a must-have config, let me know and > >>>>>> I'll prioritize enabling it first. 
> >>>>>> > >>>>>> The series is arranged as follows: > >>>>>> > >>>>>> - patch 1: Add macros required for converting non-arch code to support > >>>>>> boot-time page size selection > >>>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from > >>>>>> all non-arch code > >>>>>> - patches 37-38: Some arm64 tidy ups > >>>>>> - patch 39: Add macros required for converting arm64 code to > >>>>> support > >>>>>> boot-time page size selection > >>>>>> - patches 40-56: arm64 changes to support boot-time page size selection > >>>>>> - patch 57: Add arm64 Kconfig option to enable boot-time page > >>>>> size > >>>>>> selection > >>>>>> > >>>>>> Ideally, I'd like to get the basics merged (something like this series), > >>>>>> then incrementally improve it over a handful of kernel releases until we > >>>>>> can demonstrate that we have feature parity with the compile-time build and > >>>>>> no performance blockers. Once at that point, ideally the compile-time build > >>>>>> options would be removed and the code could be cleaned up further. > >>>>>> > >>>>>> One of the bigger peices that I'd propose to add as a follow up, is to make > >>>>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback > >>>>>> handling. > >>>>>> > >>>>>> Assuming people are ammenable to the rough shape, how would I go about > >>>>>> getting the non-arch changes merged? Since they cover many subsystems, will > >>>>>> each piece need to go independently to each relevant maintainer or could it > >>>>>> all be merged together through the arm64 tree? > >>>>>> > >>>>>> Image Size > >>>>>> ========== > >>>>>> > >>>>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > >>>>>> kernel image on disk for base (before any changes applied), compile (with > >>>>>> changes, configured for compile-time page size) and boot (with changes, > >>>>>> configured for boot-time page size). 
> >>>>>> > >>>>>> You can see that the compile-16k and 64k configs are actually slightly > >>>>>> smaller than the baselines; that's due to optimizing some buffer sizes > >>>>>> which didn't need to depend on page size during the series. The boot-time > >>>>>> image is ~1% bigger than the 64k compile-time image. I believe there is > >>>>>> scope to improve this to make it > >>>>>> equal to compile-64k if required: > >>>>>> | config | size/KB | diff/KB | diff/% | > >>>>>> | > >>>>>> |-------------|---------|---------|---------| > >>>>>> | > >>>>>> | base-4k | 54895 | 0 | 0.0% | > >>>>>> | base-16k | 55161 | 266 | 0.5% | > >>>>>> | base-64k | 56775 | 1880 | 3.4% | > >>>>>> | compile-4k | 54895 | 0 | 0.0% | > >>>>>> | compile-16k | 55097 | 202 | 0.4% | > >>>>>> | compile-64k | 56391 | 1496 | 2.7% | > >>>>>> | boot-4K | 57045 | 2150 | 3.9% | > >>>>>> > >>>>>> And below shows the size of the image in memory at run-time, separated for > >>>>>> text and data costs. The boot image has ~1% text cost; most likely due to > >>>>>> the fact that PAGE_SIZE and friends are not compile-time constants so need > >>>>>> instructions to load the values and do arithmetic.
I believe we could > >>>>>> eventually get the data cost to match the cost for the compile image for > >>>>>> the chosen page size by freeing > >>>>>> the ends of the static buffers not needed for the selected page size: > >>>>>> | | text | text | text | data | data | data | > >>>>>> | > >>>>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > >>>>>> | > >>>>>> |-------------|---------|---------|---------|---------|---------|---------| > >>>>>> | > >>>>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > >>>>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > >>>>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > >>>>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > >>>>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > >>>>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > >>>>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > >>>>>> > >>>>>> Functional Testing > >>>>>> ================== > >>>>>> > >>>>>> I've build-tested defconfig for all arches supported by tuxmake (which is > >>>>>> most) without issue. > >>>>>> > >>>>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page > >>>>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, > >>>>>> with no regressions observed vs the equivalent compile-time page size build > >>>>>> (although the mm-selftests have a few existing failures when run against > >>>>>> 16K and 64K kernels - those should really be investigated and fixed > >>>>>> independently). > >>>>>> > >>>>>> Test coverage is lacking for many of the drivers that I've touched, but in > >>>>>> many cases, I'm hoping the changes are simple enough that review might > >>>>>> suffice? 
> >>>>>> > >>>>>> Performance Testing > >>>>>> =================== > >>>>>> > >>>>>> I've run some limited performance benchmarks: > >>>>>> > >>>>>> First, a real-world benchmark that causes a lot of page table manipulation > >>>>>> (and therefore we would expect to see regression here if we are going to > >>>>>> see it anywhere); kernel compilation. It barely registers a change. Values > >>>>>> are times, > >>>>>> so smaller is better. All relative to base-4k: > >>>>>> | | kern | kern | user | user | real | real | > >>>>>> | > >>>>>> | config | mean | stdev | mean | stdev | mean | stdev | > >>>>>> | > >>>>>> |-------------|---------|---------|---------|---------|---------|---------| > >>>>>> | > >>>>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > >>>>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > >>>>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > >>>>>> > >>>>>> The Speedometer JavaScript benchmark also shows no change. Values are runs > >>>>>> per > >>>>>> min, so bigger is better. All relative to base-4k: > >>>>>> | config | mean | stdev | > >>>>>> | > >>>>>> |-------------|---------|---------| > >>>>>> | > >>>>>> | base-4k | 0.0% | 0.8% | > >>>>>> | compile-4k | 0.4% | 0.8% | > >>>>>> | boot-4k | 0.0% | 0.9% | > >>>>>> > >>>>>> Finally, I've run some microbenchmarks known to stress page table > >>>>>> manipulations (originally from David Hildenbrand). The fork test > >>>>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap > >>>>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing > >>>>>> it. The fork test is known to be extremely sensitive to any changes that > >>>>>> cause instructions to be aligned differently in cachelines. When using this > >>>>>> test for other changes, I've seen double digit regressions for the > >>>>>> slightest thing, so 12% regression on this test is actually fairly good. 
> >>>>>> This likely represents the extreme worst case for regressions that will be > >>>>>> observed across other microbenchmarks (famous last > >>>>>> words). Values are times, so smaller is better. All relative to base-4k: > >>>>>> | | fork | fork | munmap | munmap | > >>>>>> | > >>>>>> | config | mean | stdev | stdev | stdev | > >>>>>> | > >>>>>> |-------------|---------|---------|---------|---------| > >>>>>> | > >>>>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > >>>>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > >>>>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > >>>>>> > >>>>>> NOTE: The series applies on top of v6.11. > >>>>>> > >>>>>> Thanks, > >>>>>> Ryan > >>>>>> > >>>>>> > >>>>>> Ryan Roberts (57): > >>>>>> mm: Add macros ahead of supporting boot-time page size selection > >>>>>> vmlinux: Align to PAGE_SIZE_MAX > >>>>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > >>>>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible > >>>>>> mm: Avoid split pmd ptl if pmd level is run-time folded > >>>>>> mm: Remove PAGE_SIZE compile-time constant assumption > >>>>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > >>>>>> fs: Remove PAGE_SIZE compile-time constant assumption > >>>>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption > >>>>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption > >>>>>> fork: Permit boot-time THREAD_SIZE determination > >>>>>> cgroup: Remove PAGE_SIZE compile-time constant assumption > >>>>>> bpf: Remove PAGE_SIZE compile-time constant assumption > >>>>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > >>>>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption > >>>>>> perf: Remove PAGE_SIZE compile-time constant assumption > >>>>>> kvm: Remove PAGE_SIZE compile-time constant assumption > >>>>>> trace: Remove PAGE_SIZE compile-time constant assumption > >>>>>> crash: Remove PAGE_SIZE compile-time constant assumption > >>>>>> crypto: Remove 
PAGE_SIZE compile-time constant assumption > >>>>>> sunrpc: Remove PAGE_SIZE compile-time constant assumption > >>>>>> sound: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: fec: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption > >>>>>> net: igb: Remove PAGE_SIZE compile-time constant assumption > >>>>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption > >>>>>> edac: Remove PAGE_SIZE compile-time constant assumption > >>>>>> optee: Remove PAGE_SIZE compile-time constant assumption > >>>>>> random: Remove PAGE_SIZE compile-time constant assumption > >>>>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption > >>>>>> virtio: Remove PAGE_SIZE compile-time constant assumption > >>>>>> xen: Remove PAGE_SIZE compile-time constant assumption > >>>>>> arm64: Fix macros to work in C code in addition to the linker script > >>>>>> arm64: Track early pgtable allocation limit > >>>>>> arm64: Introduce macros required for boot-time page selection > >>>>>> arm64: Refactor early pgtable size calculation macros > >>>>>> arm64: Pass desired page size on command line > >>>>>> arm64: Divorce early init from PAGE_SIZE > >>>>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > >>>>>> arm64: Align sections to PAGE_SIZE_MAX > >>>>>> arm64: Rework trampoline rodata mapping > >>>>>> arm64: Generalize fixmap for boot-time page size > >>>>>> arm64: Statically allocate and align for worst-case page size > >>>>>> arm64: Convert switch to if for non-const comparison values > >>>>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON > >>>>>> arm64: Remove PAGE_SZ asm-offset > >>>>>> arm64: Introduce cpu features for 
page sizes > >>>>>> arm64: Remove PAGE_SIZE from assembly code > >>>>>> arm64: Runtime-fold pmd level > >>>>>> arm64: Support runtime folding in idmap_kpti_install_ng_mappings > >>>>>> arm64: TRAMP_VALIAS is no longer compile-time constant > >>>>>> arm64: Determine THREAD_SIZE at boot-time > >>>>>> arm64: Enable boot-time page size selection > >>>>>> > >>>>>> arch/alpha/include/asm/page.h | 1 + > >>>>>> arch/arc/include/asm/page.h | 1 + > >>>>>> arch/arm/include/asm/page.h | 1 + > >>>>>> arch/arm64/Kconfig | 26 ++- > >>>>>> arch/arm64/include/asm/assembler.h | 78 ++++++- > >>>>>> arch/arm64/include/asm/cpufeature.h | 44 +++- > >>>>>> arch/arm64/include/asm/efi.h | 2 +- > >>>>>> arch/arm64/include/asm/fixmap.h | 28 ++- > >>>>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > >>>>>> arch/arm64/include/asm/kvm_arm.h | 21 +- > >>>>>> arch/arm64/include/asm/kvm_hyp.h | 11 + > >>>>>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- > >>>>>> arch/arm64/include/asm/memory.h | 62 ++++-- > >>>>>> arch/arm64/include/asm/page-def.h | 3 +- > >>>>>> arch/arm64/include/asm/pgalloc.h | 16 +- > >>>>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > >>>>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > >>>>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- > >>>>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > >>>>>> arch/arm64/include/asm/processor.h | 10 +- > >>>>>> arch/arm64/include/asm/sections.h | 1 + > >>>>>> arch/arm64/include/asm/smp.h | 1 + > >>>>>> arch/arm64/include/asm/sparsemem.h | 15 +- > >>>>>> arch/arm64/include/asm/sysreg.h | 54 +++-- > >>>>>> arch/arm64/include/asm/tlb.h | 3 + > >>>>>> arch/arm64/kernel/asm-offsets.c | 4 +- > >>>>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- > >>>>>> arch/arm64/kernel/efi.c | 2 +- > >>>>>> arch/arm64/kernel/entry.S | 60 +++++- > >>>>>> arch/arm64/kernel/head.S | 46 +++- > >>>>>> arch/arm64/kernel/hibernate-asm.S | 6 +- > >>>>>> arch/arm64/kernel/image-vars.h | 14 ++ > >>>>>> arch/arm64/kernel/image.h | 4 + 
> >>>>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > >>>>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > >>>>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > >>>>>> arch/arm64/kernel/pi/pi.h | 63 +++++- > >>>>>> arch/arm64/kernel/relocate_kernel.S | 10 +- > >>>>>> arch/arm64/kernel/vdso-wrap.S | 4 +- > >>>>>> arch/arm64/kernel/vdso.c | 7 +- > >>>>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > >>>>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- > >>>>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > >>>>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > >>>>>> arch/arm64/kvm/arm.c | 10 + > >>>>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > >>>>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > >>>>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > >>>>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > >>>>>> arch/arm64/kvm/mmu.c | 39 ++-- > >>>>>> arch/arm64/lib/clear_page.S | 7 +- > >>>>>> arch/arm64/lib/copy_page.S | 33 ++- > >>>>>> arch/arm64/lib/mte.S | 27 ++- > >>>>>> arch/arm64/mm/Makefile | 1 + > >>>>>> arch/arm64/mm/fixmap.c | 38 ++-- > >>>>>> arch/arm64/mm/hugetlbpage.c | 40 +--- > >>>>>> arch/arm64/mm/init.c | 26 +-- > >>>>>> arch/arm64/mm/kasan_init.c | 8 +- > >>>>>> arch/arm64/mm/mmu.c | 53 +++-- > >>>>>> arch/arm64/mm/pgd.c | 12 +- > >>>>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ > >>>>>> arch/arm64/mm/proc.S | 128 ++++++++--- > >>>>>> arch/arm64/mm/ptdump.c | 3 +- > >>>>>> arch/arm64/tools/cpucaps | 3 + > >>>>>> arch/csky/include/asm/page.h | 3 + > >>>>>> arch/hexagon/include/asm/page.h | 2 + > >>>>>> arch/loongarch/include/asm/page.h | 2 + > >>>>>> arch/m68k/include/asm/page.h | 1 + > >>>>>> arch/microblaze/include/asm/page.h | 1 + > >>>>>> arch/mips/include/asm/page.h | 1 + > >>>>>> arch/nios2/include/asm/page.h | 2 + > >>>>>> arch/openrisc/include/asm/page.h | 1 + > >>>>>> arch/parisc/include/asm/page.h | 1 + > >>>>>> arch/powerpc/include/asm/page.h | 2 + > >>>>>> arch/riscv/include/asm/page.h | 1 + > >>>>>> 
arch/s390/include/asm/page.h | 1 + > >>>>>> arch/sh/include/asm/page.h | 1 + > >>>>>> arch/sparc/include/asm/page.h | 3 + > >>>>>> arch/um/include/asm/page.h | 2 + > >>>>>> arch/x86/include/asm/page_types.h | 2 + > >>>>>> arch/xtensa/include/asm/page.h | 1 + > >>>>>> crypto/lskcipher.c | 4 +- > >>>>>> drivers/ata/sata_sil24.c | 46 ++-- > >>>>>> drivers/base/node.c | 6 +- > >>>>>> drivers/base/topology.c | 32 +-- > >>>>>> drivers/block/virtio_blk.c | 2 +- > >>>>>> drivers/char/random.c | 4 +- > >>>>>> drivers/edac/edac_mc.h | 13 +- > >>>>>> drivers/firmware/efi/libstub/arm64.c | 3 +- > >>>>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- > >>>>>> drivers/mtd/mtdswap.c | 4 +- > >>>>>> drivers/net/ethernet/freescale/fec.h | 3 +- > >>>>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- > >>>>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > >>>>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > >>>>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- > >>>>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > >>>>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > >>>>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- > >>>>>> drivers/net/ethernet/marvell/sky2.h | 2 +- > >>>>>> drivers/tee/optee/call.c | 7 +- > >>>>>> drivers/tee/optee/smc_abi.c | 2 +- > >>>>>> drivers/virtio/virtio_balloon.c | 10 +- > >>>>>> drivers/xen/balloon.c | 11 +- > >>>>>> drivers/xen/biomerge.c | 12 +- > >>>>>> drivers/xen/privcmd.c | 2 +- > >>>>>> drivers/xen/xenbus/xenbus_client.c | 5 +- > >>>>>> drivers/xen/xlate_mmu.c | 6 +- > >>>>>> fs/binfmt_elf.c | 11 +- > >>>>>> fs/buffer.c | 2 +- > >>>>>> fs/coredump.c | 8 +- > >>>>>> fs/ext4/ext4.h | 36 ++-- > >>>>>> fs/ext4/move_extent.c | 2 +- > >>>>>> fs/ext4/readpage.c | 2 +- > >>>>>> fs/fat/dir.c | 4 +- > >>>>>> fs/fat/fatent.c | 4 +- > >>>>>> fs/nfs/nfs42proc.c | 2 +- > >>>>>> fs/nfs/nfs42xattr.c | 2 +- > >>>>>> fs/nfs/nfs4proc.c | 2 +- > >>>>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ > >>>>>> 
include/asm-generic/vmlinux.lds.h | 38 ++-- > >>>>>> include/linux/buffer_head.h | 1 + > >>>>>> include/linux/cpumask.h | 5 + > >>>>>> include/linux/linkage.h | 4 +- > >>>>>> include/linux/mm.h | 17 +- > >>>>>> include/linux/mm_types.h | 15 +- > >>>>>> include/linux/mm_types_task.h | 2 +- > >>>>>> include/linux/mmzone.h | 3 +- > >>>>>> include/linux/netlink.h | 6 +- > >>>>>> include/linux/percpu-defs.h | 4 +- > >>>>>> include/linux/perf_event.h | 2 +- > >>>>>> include/linux/sched.h | 4 +- > >>>>>> include/linux/slab.h | 7 +- > >>>>>> include/linux/stackdepot.h | 6 +- > >>>>>> include/linux/sunrpc/svc.h | 8 +- > >>>>>> include/linux/sunrpc/svc_rdma.h | 4 +- > >>>>>> include/linux/sunrpc/svcsock.h | 2 +- > >>>>>> include/linux/swap.h | 17 +- > >>>>>> include/linux/swapops.h | 6 +- > >>>>>> include/linux/thread_info.h | 10 +- > >>>>>> include/xen/page.h | 2 + > >>>>>> init/main.c | 7 +- > >>>>>> kernel/bpf/core.c | 9 +- > >>>>>> kernel/bpf/ringbuf.c | 54 ++--- > >>>>>> kernel/cgroup/cgroup.c | 8 +- > >>>>>> kernel/crash_core.c | 2 +- > >>>>>> kernel/events/core.c | 2 +- > >>>>>> kernel/fork.c | 71 +++---- > >>>>>> kernel/power/power.h | 2 +- > >>>>>> kernel/power/snapshot.c | 2 +- > >>>>>> kernel/power/swap.c | 129 +++++++++-- > >>>>>> kernel/trace/fgraph.c | 2 +- > >>>>>> kernel/trace/trace.c | 2 +- > >>>>>> lib/stackdepot.c | 6 +- > >>>>>> mm/kasan/report.c | 3 +- > >>>>>> mm/memcontrol.c | 11 +- > >>>>>> mm/memory.c | 4 +- > >>>>>> mm/mmap.c | 2 +- > >>>>>> mm/page-writeback.c | 2 +- > >>>>>> mm/page_alloc.c | 31 +-- > >>>>>> mm/slub.c | 2 +- > >>>>>> mm/sparse.c | 2 +- > >>>>>> mm/swapfile.c | 2 +- > >>>>>> mm/vmalloc.c | 7 +- > >>>>>> net/9p/trans_virtio.c | 4 +- > >>>>>> net/core/hotdata.c | 4 +- > >>>>>> net/core/skbuff.c | 4 +- > >>>>>> net/core/sysctl_net_core.c | 2 +- > >>>>>> net/sunrpc/cache.c | 3 +- > >>>>>> net/unix/af_unix.c | 2 +- > >>>>>> sound/soc/soc-utils.c | 4 +- > >>>>>> virt/kvm/kvm_main.c | 2 +- > >>>>>> 172 files changed, 2185 insertions(+), 
951 deletions(-) > >>>>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > >>>>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > >>>>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c > >>>>>> create mode 100644 include/asm-generic/pgtable-geometry.h > >>>>>> > >>>>>> -- > >>>>>> 2.43.0 > >>>>> > >>>>> This is a generally very exciting patch set! I'm looking forward to seeing it > >>>>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. > >>>>> > >>>>> That said, I have a couple of questions: > >>>>> > >>>>> * Going forward, how would we handle drivers/modules that require a particular > >>>>> page size? For example, the Apple Silicon IOMMU driver code requires the > >>>>> kernel to operate in 16k page size mode, and it would need to be disabled in > >>>>> other page sizes. > >>>> > >>>> I think these drivers would want to check PAGE_SIZE at probe time and fail if an > >>>> unsupported page size is in use. Do you see any issue with that? > >>>> > >>>>> > >>>>> * How would we handle an invalid selection at boot? > >>>> > >>>> What do you mean by invalid here? The current policy validates that the > >>>> requested page size is supported by the HW by checking mmfr0. If no page size is > >>>> passed on the command line, or the passed value is not supported by the HW, then > >>>> we default to the largest page size supported by the HW (so for Apple > >>>> Silicon that would be 16k since the HW doesn't support 64k). Although I think it > >>>> may be better to change that policy to use the smallest page size in this case; > >>>> 4k is the safer bet for compat and will waste much less memory than 64k. > >>>> > >>>>> Can we program in a > >>>>> fallback when the "wrong" mode is selected for a chip or something similar? > >>>> > >>>> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple > >>>> Silicon?
The trouble is that we need to select the page size, very early in > >>>> boot, before start_kernel() is called, so we really only have generic arch code > >>>> and the command line with which to make the decision. > >>> > >>> Yes... I think a build-time CONFIG for default page size, which can be > >>> overridden by a karg makes sense... Even on platforms like Apple > >>> Silicon you may want to test very specific things in 4k by overriding > >>> with a karg. > >> > >> Ahh, yes, that would certainly work. I'll work it into the next version. > >> > > > > Could we maybe extend to have some kind of way to include a table of > > SoC IDs that certain modes are disabled (e.g. 64k on Apple Silicon) > > 64k is already disabled on Apple Silicon because mmfr0 reports that 64k is not > supported. > > > and preferred modes when no arg is set (16k for Apple Silicon)? That > > And it's not obvious that we should hard-code a page size preference to a SoC > ID. If the CPU can support multiple page sizes, it should be up to the SW stack > to decide, not the SoC. > > I'm guessing your desire is to have a single kernel build that will boot 16k by > default on Apple Silicon and 4k by default on other systems, all without needing > to modify the command line? Personally I think it's cleaner to just require > setting the page size on the command line in these cases. > > > way it'd work something like this: > > > > 1. Table identification of 4/16/64 depending on identified SoC > So I'd prefer not to have this > > > 2. Unidentified ones follow build-time default > > 3. karg forces a mode regardless > But keep these 2. > I think it makes sense to have it, because it's not just Apple Silicon where such a preference/requirement may be necessary. Apple Silicon technically works at 4k, but is completely broken at 4k because Linux cannot do 16k IOMMU with 4k everything else, so being able to at least prefer 16k out of the box is important. 
And SoCs like the NVIDIA Grace Hopper platform prefer 64k over other options (though I am unaware of a gross incompatibility that effectively requires it like Apple Silicon has). When we're trying to get to "single generic image that works everywhere", stuff like this matters and I would really like you to consider it from the lens of "we want things to work as automagic as they do on x86". -- 真実はいつも一つ!/ Always, there's only one truth!
Neal Gompa 於 2024/10/22 下午5:33 寫道: > On Mon, Oct 21, 2024 at 11:02 AM Ryan Roberts <ryan.roberts@arm.com> wrote: >> >> On 21/10/2024 14:49, Neal Gompa wrote: >>> On Mon, Oct 21, 2024 at 7:51 AM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>> On 21/10/2024 12:32, Eric Curtin wrote: >>>>> On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>>> >>>>>> On 19/10/2024 16:47, Neal Gompa wrote: >>>>>>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: >>>>>>>> Hi All, >>>>>>>> >>>>>>>> Patch bomb incoming... This covers many subsystems, so I've included a core >>>>>>>> set of people on the full series and additionally included maintainers on >>>>>>>> relevant patches. I haven't included those maintainers on this cover letter >>>>>>>> since the numbers were far too big for it to work. But I've included a link >>>>>>>> to this cover letter on each patch, so they can hopefully find their way >>>>>>>> here. For follow up submissions I'll break it up by subsystem, but for now >>>>>>>> thought it was important to show the full picture. >>>>>>>> >>>>>>>> This RFC series implements support for boot-time page size selection within >>>>>>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to >>>>>>>> date, page size has been selected at compile-time, meaning the size is >>>>>>>> baked into a given kernel image. As use of larger-than-4K page sizes become >>>>>>>> more prevalent this starts to present a problem for distributions. >>>>>>>> Boot-time page size selection enables the creation of a single kernel >>>>>>>> image, which can be told which page size to use on the kernel command line. >>>>>>>> >>>>>>>> Why is having an image-per-page size problematic? >>>>>>>> ================================================= >>>>>>>> >>>>>>>> Many traditional distros are now supporting both 4K and 64K. And this means >>>>>>>> managing 2 kernel packages, along with drivers for each. 
For some, it means >>>>>>>> multiple installer flavours and multiple ISOs. All of this adds up to a >>>>>>>> less-than-ideal level of complexity. Additionally, Android now supports 4K >>>>>>>> and 16K kernels. I'm told having to explicitly manage their KABI for each >>>>>>>> kernel is painful, and the extra flash space required for both kernel >>>>>>>> images and the duplicated modules has been problematic. Boot-time page size >>>>>>>> selection solves all of this. >>>>>>>> >>>>>>>> Additionally, in starting to think about the longer term deployment story >>>>>>>> for D128 page tables, which Arm architecture now supports, a lot of the >>>>>>>> same problems need to be solved, so this work sets us up nicely for that. >>>>>>>> >>>>>>>> So what's the down side? >>>>>>>> ======================== >>>>>>>> >>>>>>>> Well nothing's free; various static allocations in the kernel image must be >>>>>>>> sized for the worst case (largest supported page size), so image size is in >>>>>>>> line with size of 64K compile-time image. So if you're interested in 4K or >>>>>>>> 16K, there is a slight increase to the image size. But I expect that >>>>>>>> problem goes away if you're compressing the image - it's just some extra >>>>>>>> zeros. At boot-time, I expect we could free the unused static storage once >>>>>>>> we know the page size - although that would be a follow up enhancement. >>>>>>>> >>>>>>>> And then there is performance. Since PAGE_SIZE and friends are no longer >>>>>>>> compile-time constants, we must look up their values and do arithmetic at >>>>>>>> runtime instead of compile-time. My early perf testing suggests this is >>>>>>>> imperceptible for real-world workloads, and only has small impact on >>>>>>>> microbenchmarks - more on this below.
>>>>>>>> >>>>>>>> Approach >>>>>>>> ======== >>>>>>>> >>>>>>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >>>>>>>> friends are compile-time constant, but in a way that allows the compiler to >>>>>>>> perform the same optimizations as was previously being done if they do turn >>>>>>>> out to be compile-time constant. Where constants are required, we use >>>>>>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full >>>>>>>> description of all the classes of problems to solve. >>>>>>>> >>>>>>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >>>>>>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. >>>>>>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >>>>>>>> Kconfig, which is an alternative to selecting a compile-time page size. >>>>>>>> >>>>>>>> When boot-time page size is active, the arch pgtable geometry macro >>>>>>>> definitions resolve to something that can be configured at boot. The arm64 >>>>>>>> implementation in this series mainly uses global, __ro_after_init >>>>>>>> variables. I've tried using alternatives patching, but that performs worse >>>>>>>> than loading from memory; I think due to code size bloat. >>>>>>>> >>>>>>>> Status >>>>>>>> ====== >>>>>>>> >>>>>>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented >>>>>>>> enough to compile the kernel image itself with defconfig (and a few other >>>>>>>> bits and pieces). This is enough to build a kernel that can boot under QEMU >>>>>>>> or FVP. I'll happily do the rest of the work to enable all the extra >>>>>>>> drivers, but wanted to get feedback on the shape of this effort first. If >>>>>>>> anyone wants to do any testing, and has a must-have config, let me know and >>>>>>>> I'll prioritize enabling it first. 
>>>>>>>> >>>>>>>> The series is arranged as follows: >>>>>>>> >>>>>>>> - patch 1: Add macros required for converting non-arch code to support >>>>>>>> boot-time page size selection >>>>>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from >>>>>>>> all non-arch code >>>>>>>> - patches 37-38: Some arm64 tidy ups >>>>>>>> - patch 39: Add macros required for converting arm64 code to support >>>>>>>> boot-time page size selection >>>>>>>> - patches 40-56: arm64 changes to support boot-time page size selection >>>>>>>> - patch 57: Add arm64 Kconfig option to enable boot-time page size >>>>>>>> selection >>>>>>>> >>>>>>>> Ideally, I'd like to get the basics merged (something like this series), >>>>>>>> then incrementally improve it over a handful of kernel releases until we >>>>>>>> can demonstrate that we have feature parity with the compile-time build and >>>>>>>> no performance blockers. Once at that point, ideally the compile-time build >>>>>>>> options would be removed and the code could be cleaned up further. >>>>>>>> >>>>>>>> One of the bigger pieces that I'd propose to add as a follow up, is to make >>>>>>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >>>>>>>> handling. >>>>>>>> >>>>>>>> Assuming people are amenable to the rough shape, how would I go about >>>>>>>> getting the non-arch changes merged? Since they cover many subsystems, will >>>>>>>> each piece need to go independently to each relevant maintainer or could it >>>>>>>> all be merged together through the arm64 tree? >>>>>>>> >>>>>>>> Image Size >>>>>>>> ========== >>>>>>>> >>>>>>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >>>>>>>> kernel image on disk for base (before any changes applied), compile (with >>>>>>>> changes, configured for compile-time page size) and boot (with changes, >>>>>>>> configured for boot-time page size).
>>>>>>>> >>>>>>>> You can see that the compile-16k and 64k configs are actually slightly >>>>>>>> smaller than the baselines; that's due to optimizing some buffer sizes >>>>>>>> which didn't need to depend on page size during the series. The boot-time >>>>>>>> image is ~1% bigger than the 64k compile-time image. I believe there is >>>>>>>> scope to improve this to make it >>>>>>>> equal to compile-64k if required: >>>>>>>> | config | size/KB | diff/KB | diff/% | >>>>>>>> | >>>>>>>> |-------------|---------|---------|---------| >>>>>>>> | >>>>>>>> | base-4k | 54895 | 0 | 0.0% | >>>>>>>> | base-16k | 55161 | 266 | 0.5% | >>>>>>>> | base-64k | 56775 | 1880 | 3.4% | >>>>>>>> | compile-4k | 54895 | 0 | 0.0% | >>>>>>>> | compile-16k | 55097 | 202 | 0.4% | >>>>>>>> | compile-64k | 56391 | 1496 | 2.7% | >>>>>>>> | boot-4K | 57045 | 2150 | 3.9% | >>>>>>>> >>>>>>>> And below shows the size of the image in memory at run-time, separated for >>>>>>>> text and data costs. The boot image has ~1% text cost; most likely due to >>>>>>>> the fact that PAGE_SIZE and friends are not compile-time constants so need >>>>>>>> instructions to load the values and do arithmetic.
I believe we could >>>>>>>> eventually get the data cost to match the cost for the compile image for >>>>>>>> the chosen page size by freeing >>>>>>>> the ends of the static buffers not needed for the selected page size: >>>>>>>> | | text | text | text | data | data | data | >>>>>>>> | >>>>>>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >>>>>>>> | >>>>>>>> |-------------|---------|---------|---------|---------|---------|---------| >>>>>>>> | >>>>>>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >>>>>>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >>>>>>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >>>>>>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >>>>>>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >>>>>>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >>>>>>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >>>>>>>> >>>>>>>> Functional Testing >>>>>>>> ================== >>>>>>>> >>>>>>>> I've build-tested defconfig for all arches supported by tuxmake (which is >>>>>>>> most) without issue. >>>>>>>> >>>>>>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page >>>>>>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, >>>>>>>> with no regressions observed vs the equivalent compile-time page size build >>>>>>>> (although the mm-selftests have a few existing failures when run against >>>>>>>> 16K and 64K kernels - those should really be investigated and fixed >>>>>>>> independently). >>>>>>>> >>>>>>>> Test coverage is lacking for many of the drivers that I've touched, but in >>>>>>>> many cases, I'm hoping the changes are simple enough that review might >>>>>>>> suffice? 
>>>>>>>> >>>>>>>> Performance Testing >>>>>>>> =================== >>>>>>>> >>>>>>>> I've run some limited performance benchmarks: >>>>>>>> >>>>>>>> First, a real-world benchmark that causes a lot of page table manipulation >>>>>>>> (and therefore we would expect to see regression here if we are going to >>>>>>>> see it anywhere); kernel compilation. It barely registers a change. Values >>>>>>>> are times, >>>>>>>> so smaller is better. All relative to base-4k: >>>>>>>> | | kern | kern | user | user | real | real | >>>>>>>> | >>>>>>>> | config | mean | stdev | mean | stdev | mean | stdev | >>>>>>>> | >>>>>>>> |-------------|---------|---------|---------|---------|---------|---------| >>>>>>>> | >>>>>>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >>>>>>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >>>>>>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >>>>>>>> >>>>>>>> The Speedometer JavaScript benchmark also shows no change. Values are runs >>>>>>>> per >>>>>>>> min, so bigger is better. All relative to base-4k: >>>>>>>> | config | mean | stdev | >>>>>>>> | >>>>>>>> |-------------|---------|---------| >>>>>>>> | >>>>>>>> | base-4k | 0.0% | 0.8% | >>>>>>>> | compile-4k | 0.4% | 0.8% | >>>>>>>> | boot-4k | 0.0% | 0.9% | >>>>>>>> >>>>>>>> Finally, I've run some microbenchmarks known to stress page table >>>>>>>> manipulations (originally from David Hildenbrand). The fork test >>>>>>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap >>>>>>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing >>>>>>>> it. The fork test is known to be extremely sensitive to any changes that >>>>>>>> cause instructions to be aligned differently in cachelines. When using this >>>>>>>> test for other changes, I've seen double digit regressions for the >>>>>>>> slightest thing, so 12% regression on this test is actually fairly good. 
>>>>>>>> This likely represents the extreme worst case for regressions that will be >>>>>>>> observed across other microbenchmarks (famous last >>>>>>>> words). Values are times, so smaller is better. All relative to base-4k: >>>>>>>> | | fork | fork | munmap | munmap | >>>>>>>> | >>>>>>>> | config | mean | stdev | stdev | stdev | >>>>>>>> | >>>>>>>> |-------------|---------|---------|---------|---------| >>>>>>>> | >>>>>>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >>>>>>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >>>>>>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >>>>>>>> >>>>>>>> NOTE: The series applies on top of v6.11. >>>>>>>> >>>>>>>> Thanks, >>>>>>>> Ryan >>>>>>>> >>>>>>>> >>>>>>>> Ryan Roberts (57): >>>>>>>> mm: Add macros ahead of supporting boot-time page size selection >>>>>>>> vmlinux: Align to PAGE_SIZE_MAX >>>>>>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >>>>>>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible >>>>>>>> mm: Avoid split pmd ptl if pmd level is run-time folded >>>>>>>> mm: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >>>>>>>> fs: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> fork: Permit boot-time THREAD_SIZE determination >>>>>>>> cgroup: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> bpf: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> perf: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> kvm: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> trace: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> crash: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> crypto: Remove 
PAGE_SIZE compile-time constant assumption >>>>>>>> sunrpc: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> sound: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: fec: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> net: igb: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> edac: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> optee: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> random: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> virtio: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> xen: Remove PAGE_SIZE compile-time constant assumption >>>>>>>> arm64: Fix macros to work in C code in addition to the linker script >>>>>>>> arm64: Track early pgtable allocation limit >>>>>>>> arm64: Introduce macros required for boot-time page selection >>>>>>>> arm64: Refactor early pgtable size calculation macros >>>>>>>> arm64: Pass desired page size on command line >>>>>>>> arm64: Divorce early init from PAGE_SIZE >>>>>>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >>>>>>>> arm64: Align sections to PAGE_SIZE_MAX >>>>>>>> arm64: Rework trampoline rodata mapping >>>>>>>> arm64: Generalize fixmap for boot-time page size >>>>>>>> arm64: Statically allocate and align for worst-case page size >>>>>>>> arm64: Convert switch to if for non-const comparison values >>>>>>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >>>>>>>> arm64: Remove PAGE_SZ asm-offset >>>>>>>> arm64: Introduce cpu features for 
page sizes >>>>>>>> arm64: Remove PAGE_SIZE from assembly code >>>>>>>> arm64: Runtime-fold pmd level >>>>>>>> arm64: Support runtime folding in idmap_kpti_install_ng_mappings >>>>>>>> arm64: TRAMP_VALIAS is no longer compile-time constant >>>>>>>> arm64: Determine THREAD_SIZE at boot-time >>>>>>>> arm64: Enable boot-time page size selection >>>>>>>> >>>>>>>> arch/alpha/include/asm/page.h | 1 + >>>>>>>> arch/arc/include/asm/page.h | 1 + >>>>>>>> arch/arm/include/asm/page.h | 1 + >>>>>>>> arch/arm64/Kconfig | 26 ++- >>>>>>>> arch/arm64/include/asm/assembler.h | 78 ++++++- >>>>>>>> arch/arm64/include/asm/cpufeature.h | 44 +++- >>>>>>>> arch/arm64/include/asm/efi.h | 2 +- >>>>>>>> arch/arm64/include/asm/fixmap.h | 28 ++- >>>>>>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >>>>>>>> arch/arm64/include/asm/kvm_arm.h | 21 +- >>>>>>>> arch/arm64/include/asm/kvm_hyp.h | 11 + >>>>>>>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >>>>>>>> arch/arm64/include/asm/memory.h | 62 ++++-- >>>>>>>> arch/arm64/include/asm/page-def.h | 3 +- >>>>>>>> arch/arm64/include/asm/pgalloc.h | 16 +- >>>>>>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >>>>>>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >>>>>>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- >>>>>>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >>>>>>>> arch/arm64/include/asm/processor.h | 10 +- >>>>>>>> arch/arm64/include/asm/sections.h | 1 + >>>>>>>> arch/arm64/include/asm/smp.h | 1 + >>>>>>>> arch/arm64/include/asm/sparsemem.h | 15 +- >>>>>>>> arch/arm64/include/asm/sysreg.h | 54 +++-- >>>>>>>> arch/arm64/include/asm/tlb.h | 3 + >>>>>>>> arch/arm64/kernel/asm-offsets.c | 4 +- >>>>>>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >>>>>>>> arch/arm64/kernel/efi.c | 2 +- >>>>>>>> arch/arm64/kernel/entry.S | 60 +++++- >>>>>>>> arch/arm64/kernel/head.S | 46 +++- >>>>>>>> arch/arm64/kernel/hibernate-asm.S | 6 +- >>>>>>>> arch/arm64/kernel/image-vars.h | 14 ++ >>>>>>>> arch/arm64/kernel/image.h | 4 + 
>>>>>>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >>>>>>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >>>>>>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >>>>>>>> arch/arm64/kernel/pi/pi.h | 63 +++++- >>>>>>>> arch/arm64/kernel/relocate_kernel.S | 10 +- >>>>>>>> arch/arm64/kernel/vdso-wrap.S | 4 +- >>>>>>>> arch/arm64/kernel/vdso.c | 7 +- >>>>>>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >>>>>>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- >>>>>>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >>>>>>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >>>>>>>> arch/arm64/kvm/arm.c | 10 + >>>>>>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >>>>>>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >>>>>>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >>>>>>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >>>>>>>> arch/arm64/kvm/mmu.c | 39 ++-- >>>>>>>> arch/arm64/lib/clear_page.S | 7 +- >>>>>>>> arch/arm64/lib/copy_page.S | 33 ++- >>>>>>>> arch/arm64/lib/mte.S | 27 ++- >>>>>>>> arch/arm64/mm/Makefile | 1 + >>>>>>>> arch/arm64/mm/fixmap.c | 38 ++-- >>>>>>>> arch/arm64/mm/hugetlbpage.c | 40 +--- >>>>>>>> arch/arm64/mm/init.c | 26 +-- >>>>>>>> arch/arm64/mm/kasan_init.c | 8 +- >>>>>>>> arch/arm64/mm/mmu.c | 53 +++-- >>>>>>>> arch/arm64/mm/pgd.c | 12 +- >>>>>>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ >>>>>>>> arch/arm64/mm/proc.S | 128 ++++++++--- >>>>>>>> arch/arm64/mm/ptdump.c | 3 +- >>>>>>>> arch/arm64/tools/cpucaps | 3 + >>>>>>>> arch/csky/include/asm/page.h | 3 + >>>>>>>> arch/hexagon/include/asm/page.h | 2 + >>>>>>>> arch/loongarch/include/asm/page.h | 2 + >>>>>>>> arch/m68k/include/asm/page.h | 1 + >>>>>>>> arch/microblaze/include/asm/page.h | 1 + >>>>>>>> arch/mips/include/asm/page.h | 1 + >>>>>>>> arch/nios2/include/asm/page.h | 2 + >>>>>>>> arch/openrisc/include/asm/page.h | 1 + >>>>>>>> arch/parisc/include/asm/page.h | 1 + >>>>>>>> arch/powerpc/include/asm/page.h | 2 + >>>>>>>> arch/riscv/include/asm/page.h | 1 + >>>>>>>> 
arch/s390/include/asm/page.h | 1 + >>>>>>>> arch/sh/include/asm/page.h | 1 + >>>>>>>> arch/sparc/include/asm/page.h | 3 + >>>>>>>> arch/um/include/asm/page.h | 2 + >>>>>>>> arch/x86/include/asm/page_types.h | 2 + >>>>>>>> arch/xtensa/include/asm/page.h | 1 + >>>>>>>> crypto/lskcipher.c | 4 +- >>>>>>>> drivers/ata/sata_sil24.c | 46 ++-- >>>>>>>> drivers/base/node.c | 6 +- >>>>>>>> drivers/base/topology.c | 32 +-- >>>>>>>> drivers/block/virtio_blk.c | 2 +- >>>>>>>> drivers/char/random.c | 4 +- >>>>>>>> drivers/edac/edac_mc.h | 13 +- >>>>>>>> drivers/firmware/efi/libstub/arm64.c | 3 +- >>>>>>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- >>>>>>>> drivers/mtd/mtdswap.c | 4 +- >>>>>>>> drivers/net/ethernet/freescale/fec.h | 3 +- >>>>>>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- >>>>>>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >>>>>>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >>>>>>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >>>>>>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >>>>>>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >>>>>>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- >>>>>>>> drivers/net/ethernet/marvell/sky2.h | 2 +- >>>>>>>> drivers/tee/optee/call.c | 7 +- >>>>>>>> drivers/tee/optee/smc_abi.c | 2 +- >>>>>>>> drivers/virtio/virtio_balloon.c | 10 +- >>>>>>>> drivers/xen/balloon.c | 11 +- >>>>>>>> drivers/xen/biomerge.c | 12 +- >>>>>>>> drivers/xen/privcmd.c | 2 +- >>>>>>>> drivers/xen/xenbus/xenbus_client.c | 5 +- >>>>>>>> drivers/xen/xlate_mmu.c | 6 +- >>>>>>>> fs/binfmt_elf.c | 11 +- >>>>>>>> fs/buffer.c | 2 +- >>>>>>>> fs/coredump.c | 8 +- >>>>>>>> fs/ext4/ext4.h | 36 ++-- >>>>>>>> fs/ext4/move_extent.c | 2 +- >>>>>>>> fs/ext4/readpage.c | 2 +- >>>>>>>> fs/fat/dir.c | 4 +- >>>>>>>> fs/fat/fatent.c | 4 +- >>>>>>>> fs/nfs/nfs42proc.c | 2 +- >>>>>>>> fs/nfs/nfs42xattr.c | 2 +- >>>>>>>> fs/nfs/nfs4proc.c | 2 +- >>>>>>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ >>>>>>>> 
include/asm-generic/vmlinux.lds.h | 38 ++-- >>>>>>>> include/linux/buffer_head.h | 1 + >>>>>>>> include/linux/cpumask.h | 5 + >>>>>>>> include/linux/linkage.h | 4 +- >>>>>>>> include/linux/mm.h | 17 +- >>>>>>>> include/linux/mm_types.h | 15 +- >>>>>>>> include/linux/mm_types_task.h | 2 +- >>>>>>>> include/linux/mmzone.h | 3 +- >>>>>>>> include/linux/netlink.h | 6 +- >>>>>>>> include/linux/percpu-defs.h | 4 +- >>>>>>>> include/linux/perf_event.h | 2 +- >>>>>>>> include/linux/sched.h | 4 +- >>>>>>>> include/linux/slab.h | 7 +- >>>>>>>> include/linux/stackdepot.h | 6 +- >>>>>>>> include/linux/sunrpc/svc.h | 8 +- >>>>>>>> include/linux/sunrpc/svc_rdma.h | 4 +- >>>>>>>> include/linux/sunrpc/svcsock.h | 2 +- >>>>>>>> include/linux/swap.h | 17 +- >>>>>>>> include/linux/swapops.h | 6 +- >>>>>>>> include/linux/thread_info.h | 10 +- >>>>>>>> include/xen/page.h | 2 + >>>>>>>> init/main.c | 7 +- >>>>>>>> kernel/bpf/core.c | 9 +- >>>>>>>> kernel/bpf/ringbuf.c | 54 ++--- >>>>>>>> kernel/cgroup/cgroup.c | 8 +- >>>>>>>> kernel/crash_core.c | 2 +- >>>>>>>> kernel/events/core.c | 2 +- >>>>>>>> kernel/fork.c | 71 +++---- >>>>>>>> kernel/power/power.h | 2 +- >>>>>>>> kernel/power/snapshot.c | 2 +- >>>>>>>> kernel/power/swap.c | 129 +++++++++-- >>>>>>>> kernel/trace/fgraph.c | 2 +- >>>>>>>> kernel/trace/trace.c | 2 +- >>>>>>>> lib/stackdepot.c | 6 +- >>>>>>>> mm/kasan/report.c | 3 +- >>>>>>>> mm/memcontrol.c | 11 +- >>>>>>>> mm/memory.c | 4 +- >>>>>>>> mm/mmap.c | 2 +- >>>>>>>> mm/page-writeback.c | 2 +- >>>>>>>> mm/page_alloc.c | 31 +-- >>>>>>>> mm/slub.c | 2 +- >>>>>>>> mm/sparse.c | 2 +- >>>>>>>> mm/swapfile.c | 2 +- >>>>>>>> mm/vmalloc.c | 7 +- >>>>>>>> net/9p/trans_virtio.c | 4 +- >>>>>>>> net/core/hotdata.c | 4 +- >>>>>>>> net/core/skbuff.c | 4 +- >>>>>>>> net/core/sysctl_net_core.c | 2 +- >>>>>>>> net/sunrpc/cache.c | 3 +- >>>>>>>> net/unix/af_unix.c | 2 +- >>>>>>>> sound/soc/soc-utils.c | 4 +- >>>>>>>> virt/kvm/kvm_main.c | 2 +- >>>>>>>> 172 files changed, 2185 insertions(+), 
951 deletions(-) >>>>>>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >>>>>>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >>>>>>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c >>>>>>>> create mode 100644 include/asm-generic/pgtable-geometry.h >>>>>>>> >>>>>>>> -- >>>>>>>> 2.43.0 >>>>>>> >>>>>>> This is a generally very exciting patch set! I'm looking forward to seeing it >>>>>>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. >>>>>>> >>>>>>> That said, I have a couple of questions: >>>>>>> >>>>>>> * Going forward, how would we handle drivers/modules that require a particular >>>>>>> page size? For example, the Apple Silicon IOMMU driver code requires the >>>>>>> kernel to operate in 16k page size mode, and it would need to be disabled in >>>>>>> other page sizes. >>>>>> >>>>>> I think these drivers would want to check PAGE_SIZE at probe time and fail if an >>>>>> unsupported page size is in use. Do you see any issue with that? >>>>>> >>>>>>> >>>>>>> * How would we handle an invalid selection at boot? >>>>>> >>>>>> What do you mean by invalid here? The current policy validates that the >>>>>> requested page size is supported by the HW by checking mmfr0. If no page size is >>>>>> passed on the command line, or the passed value is not supported by the HW, then >>>>>> we default to the largest page size supported by the HW (so for Apple >>>>>> Silicon that would be 16k since the HW doesn't support 64k). Although I think it >>>>>> may be better to change that policy to use the smallest page size in this case; >>>>>> 4k is the safer bet for compat and will waste much less memory than 64k. >>>>>> >>>>>>> Can we program in a >>>>>>> fallback when the "wrong" mode is selected for a chip or something similar? >>>>>> >>>>>> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple >>>>>> Silicon?
The trouble is that we need to select the page size, very early in >>>>>> boot, before start_kernel() is called, so we really only have generic arch code >>>>>> and the command line with which to make the decision. >>>>> >>>>> Yes... I think a build-time CONFIG for default page size, which can be >>>>> overridden by a karg makes sense... Even on platforms like Apple >>>>> Silicon you may want to test very specific things in 4k by overriding >>>>> with a karg. >>>> >>>> Ahh, yes, that would certainly work. I'll work it into the next version. >>>> >>> >>> Could we maybe extend to have some kind of way to include a table of >>> SoC IDs that certain modes are disabled (e.g. 64k on Apple Silicon) >> >> 64k is already disabled on Apple Silicon because mmfr0 reports that 64k is not >> supported. >> >>> and preferred modes when no arg is set (16k for Apple Silicon)? That >> >> And it's not obvious that we should hard-code a page size preference to a SoC >> ID. If the CPU can support multiple page sizes, it should be up to the SW stack >> to decide, not the SoC. >> >> I'm guessing your desire is to have a single kernel build that will boot 16k by >> default on Apple Silicon and 4k by default on other systems, all without needing >> to modify the command line? Personally I think it's cleaner to just require >> setting the page size on the command line in these cases. >> >>> way it'd work something like this: >>> >>> 1. Table identification of 4/16/64 depending on identified SoC >> So I'd prefer not to have this >> >>> 2. Unidentified ones follow build-time default >>> 3. karg forces a mode regardless >> But keep these 2. >> > Since we are talking about Apple Silicon and page size, I would like to add that on the Apple Silicon SoCs I am working on, the situation is like this: Apple A7 (s5l8960x), A8 (T7000), A8X (T7001): CPU MMU supports 4K and 64K page sizes. Apple A9 (s8000/s8003), A9X (s8001), A10 (t8010), A10X (t8011), A11 (t8015): CPU MMU supports 16K and 64K page sizes.
However, all of them have 4K page DART IOMMUs. > I think it makes sense to have it, because it's not just Apple Silicon > where such a preference/requirement may be necessary. Apple Silicon > technically works at 4k, but is completely broken at 4k because Linux > cannot do 16k IOMMU with 4k everything else, so being able to at least > prefer 16k out of the box is important. And SoCs like the NVIDIA Grace > Hopper platform prefer 64k over other options (though I am unaware of > a gross incompatibility that effectively requires it like Apple > Silicon has). > > When we're trying to get to "single generic image that works > everywhere", stuff like this matters and I would really like you to > consider it from the lens of "we want things to work as automagic as > they do on x86". For me, in order to get to this level of automagic, there does need to be a table of which SoC should use which page size. > > Nick Chan
On 22/10/2024 16:03, Nick Chan wrote: > > > Neal Gompa 於 2024/10/22 下午5:33 寫道: >> On Mon, Oct 21, 2024 at 11:02 AM Ryan Roberts <ryan.roberts@arm.com> wrote: >>> >>> On 21/10/2024 14:49, Neal Gompa wrote: >>>> On Mon, Oct 21, 2024 at 7:51 AM Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>> >>>>> On 21/10/2024 12:32, Eric Curtin wrote: >>>>>> On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>>>> >>>>>>> On 19/10/2024 16:47, Neal Gompa wrote: >>>>>>>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: >>>>>>>>> Hi All, >>>>>>>>> >>>>>>>>> Patch bomb incoming... This covers many subsystems, so I've included a core >>>>>>>>> set of people on the full series and additionally included maintainers on >>>>>>>>> relevant patches. I haven't included those maintainers on this cover letter >>>>>>>>> since the numbers were far too big for it to work. But I've included a link >>>>>>>>> to this cover letter on each patch, so they can hopefully find their way >>>>>>>>> here. For follow up submissions I'll break it up by subsystem, but for now >>>>>>>>> thought it was important to show the full picture. >>>>>>>>> >>>>>>>>> This RFC series implements support for boot-time page size selection within >>>>>>>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to >>>>>>>>> date, page size has been selected at compile-time, meaning the size is >>>>>>>>> baked into a given kernel image. As use of larger-than-4K page sizes become >>>>>>>>> more prevalent this starts to present a problem for distributions. >>>>>>>>> Boot-time page size selection enables the creation of a single kernel >>>>>>>>> image, which can be told which page size to use on the kernel command line. >>>>>>>>> >>>>>>>>> Why is having an image-per-page size problematic? >>>>>>>>> ================================================= >>>>>>>>> >>>>>>>>> Many traditional distros are now supporting both 4K and 64K. 
And this means >>>>>>>>> managing 2 kernel packages, along with drivers for each. For some, it means >>>>>>>>> multiple installer flavours and multiple ISOs. All of this adds up to a >>>>>>>>> less-than-ideal level of complexity. Additionally, Android now supports 4K >>>>>>>>> and 16K kernels. I'm told having to explicitly manage their KABI for each >>>>>>>>> kernel is painful, and the extra flash space required for both kernel >>>>>>>>> images and the duplicated modules has been problematic. Boot-time page size >>>>>>>>> selection solves all of this. >>>>>>>>> >>>>>>>>> Additionally, in starting to think about the longer term deployment story >>>>>>>>> for D128 page tables, which Arm architecture now supports, a lot of the >>>>>>>>> same problems need to be solved, so this work sets us up nicely for that. >>>>>>>>> >>>>>>>>> So what's the down side? >>>>>>>>> ======================== >>>>>>>>> >>>>>>>>> Well nothing's free; Various static allocations in the kernel image must be >>>>>>>>> sized for the worst case (largest supported page size), so image size is in >>>>>>>>> line with size of 64K compile-time image. So if you're interested in 4K or >>>>>>>>> 16K, there is a slight increase to the image size. But I expect that >>>>>>>>> problem goes away if you're compressing the image - it's just some extra >>>>>>>>> zeros. At boot-time, I expect we could free the unused static storage once >>>>>>>>> we know the page size - although that would be a follow up enhancement. >>>>>>>>> >>>>>>>>> And then there is performance. Since PAGE_SIZE and friends are no longer >>>>>>>>> compile-time constants, we must look up their values and do arithmetic at >>>>>>>>> runtime instead of compile-time. My early perf testing suggests this is >>>>>>>>> imperceptible for real-world workloads, and only has small impact on >>>>>>>>> microbenchmarks - more on this below.
>>>>>>>>> >>>>>>>>> Approach >>>>>>>>> ======== >>>>>>>>> >>>>>>>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >>>>>>>>> friends are compile-time constant, but in a way that allows the compiler to >>>>>>>>> perform the same optimizations as was previously being done if they do turn >>>>>>>>> out to be compile-time constant. Where constants are required, we use >>>>>>>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full >>>>>>>>> description of all the classes of problems to solve. >>>>>>>>> >>>>>>>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >>>>>>>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. >>>>>>>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >>>>>>>>> Kconfig, which is an alternative to selecting a compile-time page size. >>>>>>>>> >>>>>>>>> When boot-time page size is active, the arch pgtable geometry macro >>>>>>>>> definitions resolve to something that can be configured at boot. The arm64 >>>>>>>>> implementation in this series mainly uses global, __ro_after_init >>>>>>>>> variables. I've tried using alternatives patching, but that performs worse >>>>>>>>> than loading from memory; I think due to code size bloat. >>>>>>>>> >>>>>>>>> Status >>>>>>>>> ====== >>>>>>>>> >>>>>>>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented >>>>>>>>> enough to compile the kernel image itself with defconfig (and a few other >>>>>>>>> bits and pieces). This is enough to build a kernel that can boot under QEMU >>>>>>>>> or FVP. I'll happily do the rest of the work to enable all the extra >>>>>>>>> drivers, but wanted to get feedback on the shape of this effort first. If >>>>>>>>> anyone wants to do any testing, and has a must-have config, let me know and >>>>>>>>> I'll prioritize enabling it first. 
>>>>>>>>> >>>>>>>>> The series is arranged as follows: >>>>>>>>> >>>>>>>>> - patch 1: Add macros required for converting non-arch code to support >>>>>>>>> boot-time page size selection >>>>>>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from >>>>>>>>> all non-arch code >>>>>>>>> - patches 37-38: Some arm64 tidy ups >>>>>>>>> - patch 39: Add macros required for converting arm64 code to >>>>>>>> support >>>>>>>>> boot-time page size selection >>>>>>>>> - patches 40-56: arm64 changes to support boot-time page size selection >>>>>>>>> - patch 57: Add arm64 Kconfig option to enable boot-time page >>>>>>>> size >>>>>>>>> selection >>>>>>>>> >>>>>>>>> Ideally, I'd like to get the basics merged (something like this series), >>>>>>>>> then incrementally improve it over a handful of kernel releases until we >>>>>>>>> can demonstrate that we have feature parity with the compile-time build and >>>>>>>>> no performance blockers. Once at that point, ideally the compile-time build >>>>>>>>> options would be removed and the code could be cleaned up further. >>>>>>>>> >>>>>>>>> One of the bigger peices that I'd propose to add as a follow up, is to make >>>>>>>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >>>>>>>>> handling. >>>>>>>>> >>>>>>>>> Assuming people are ammenable to the rough shape, how would I go about >>>>>>>>> getting the non-arch changes merged? Since they cover many subsystems, will >>>>>>>>> each piece need to go independently to each relevant maintainer or could it >>>>>>>>> all be merged together through the arm64 tree? >>>>>>>>> >>>>>>>>> Image Size >>>>>>>>> ========== >>>>>>>>> >>>>>>>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >>>>>>>>> kernel image on disk for base (before any changes applied), compile (with >>>>>>>>> changes, configured for compile-time page size) and boot (with changes, >>>>>>>>> configured for boot-time page size). 
>>>>>>>>> >>>>>>>>> You can see the that compile-16k and 64k configs are actually slightly >>>>>>>>> smaller than the baselines; that's due to optimizing some buffer sizes >>>>>>>>> which didn't need to depend on page size during the series. The boot-time >>>>>>>>> image is ~1% bigger than the 64k compile-time image. I believe there is >>>>>>>>> scope to improve this to make it >>>>>>>>> equal to compile-64k if required: >>>>>>>>> | config | size/KB | diff/KB | diff/% | >>>>>>>>> | >>>>>>>>> |-------------|---------|---------|---------| >>>>>>>>> | >>>>>>>>> | base-4k | 54895 | 0 | 0.0% | >>>>>>>>> | base-16k | 55161 | 266 | 0.5% | >>>>>>>>> | base-64k | 56775 | 1880 | 3.4% | >>>>>>>>> | compile-4k | 54895 | 0 | 0.0% | >>>>>>>>> | compile-16k | 55097 | 202 | 0.4% | >>>>>>>>> | compile-64k | 56391 | 1496 | 2.7% | >>>>>>>>> | boot-4K | 57045 | 2150 | 3.9% | >>>>>>>>> >>>>>>>>> And below shows the size of the image in memory at run-time, separated for >>>>>>>>> text and data costs. The boot image has ~1% text cost; most likely due to >>>>>>>>> the fact that PAGE_SIZE and friends are not compile-time constants so need >>>>>>>>> instructions to load the values and do arithmetic. 
I believe we could >>>>>>>>> eventually get the data cost to match the cost for the compile image for >>>>>>>>> the chosen page size by freeing >>>>>>>>> the ends of the static buffers not needed for the selected page size: >>>>>>>>> | | text | text | text | data | data | data | >>>>>>>>> | >>>>>>>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >>>>>>>>> | >>>>>>>>> |-------------|---------|---------|---------|---------|---------|---------| >>>>>>>>> | >>>>>>>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >>>>>>>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >>>>>>>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >>>>>>>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >>>>>>>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >>>>>>>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >>>>>>>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >>>>>>>>> >>>>>>>>> Functional Testing >>>>>>>>> ================== >>>>>>>>> >>>>>>>>> I've build-tested defconfig for all arches supported by tuxmake (which is >>>>>>>>> most) without issue. >>>>>>>>> >>>>>>>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page >>>>>>>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, >>>>>>>>> with no regressions observed vs the equivalent compile-time page size build >>>>>>>>> (although the mm-selftests have a few existing failures when run against >>>>>>>>> 16K and 64K kernels - those should really be investigated and fixed >>>>>>>>> independently). >>>>>>>>> >>>>>>>>> Test coverage is lacking for many of the drivers that I've touched, but in >>>>>>>>> many cases, I'm hoping the changes are simple enough that review might >>>>>>>>> suffice? 
>>>>>>>>> >>>>>>>>> Performance Testing >>>>>>>>> =================== >>>>>>>>> >>>>>>>>> I've run some limited performance benchmarks: >>>>>>>>> >>>>>>>>> First, a real-world benchmark that causes a lot of page table manipulation >>>>>>>>> (and therefore we would expect to see regression here if we are going to >>>>>>>>> see it anywhere); kernel compilation. It barely registers a change. Values >>>>>>>>> are times, >>>>>>>>> so smaller is better. All relative to base-4k: >>>>>>>>> | | kern | kern | user | user | real | real | >>>>>>>>> | >>>>>>>>> | config | mean | stdev | mean | stdev | mean | stdev | >>>>>>>>> | >>>>>>>>> |-------------|---------|---------|---------|---------|---------|---------| >>>>>>>>> | >>>>>>>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >>>>>>>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >>>>>>>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >>>>>>>>> >>>>>>>>> The Speedometer JavaScript benchmark also shows no change. Values are runs >>>>>>>>> per >>>>>>>>> min, so bigger is better. All relative to base-4k: >>>>>>>>> | config | mean | stdev | >>>>>>>>> | >>>>>>>>> |-------------|---------|---------| >>>>>>>>> | >>>>>>>>> | base-4k | 0.0% | 0.8% | >>>>>>>>> | compile-4k | 0.4% | 0.8% | >>>>>>>>> | boot-4k | 0.0% | 0.9% | >>>>>>>>> >>>>>>>>> Finally, I've run some microbenchmarks known to stress page table >>>>>>>>> manipulations (originally from David Hildenbrand). The fork test >>>>>>>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap >>>>>>>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing >>>>>>>>> it. The fork test is known to be extremely sensitive to any changes that >>>>>>>>> cause instructions to be aligned differently in cachelines. When using this >>>>>>>>> test for other changes, I've seen double digit regressions for the >>>>>>>>> slightest thing, so 12% regression on this test is actually fairly good. 
>>>>>>>>> This likely represents the extreme worst case for regressions that will be >>>>>>>>> observed across other microbenchmarks (famous last >>>>>>>>> words). Values are times, so smaller is better. All relative to base-4k: >>>>>>>>> | | fork | fork | munmap | munmap | >>>>>>>>> | >>>>>>>>> | config | mean | stdev | stdev | stdev | >>>>>>>>> | >>>>>>>>> |-------------|---------|---------|---------|---------| >>>>>>>>> | >>>>>>>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >>>>>>>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >>>>>>>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >>>>>>>>> >>>>>>>>> NOTE: The series applies on top of v6.11. >>>>>>>>> >>>>>>>>> Thanks, >>>>>>>>> Ryan >>>>>>>>> >>>>>>>>> >>>>>>>>> Ryan Roberts (57): >>>>>>>>> mm: Add macros ahead of supporting boot-time page size selection >>>>>>>>> vmlinux: Align to PAGE_SIZE_MAX >>>>>>>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >>>>>>>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible >>>>>>>>> mm: Avoid split pmd ptl if pmd level is run-time folded >>>>>>>>> mm: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >>>>>>>>> fs: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> fork: Permit boot-time THREAD_SIZE determination >>>>>>>>> cgroup: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> bpf: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> perf: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> kvm: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> trace: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> crash: Remove PAGE_SIZE compile-time 
constant assumption >>>>>>>>> crypto: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> sunrpc: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> sound: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: fec: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> net: igb: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> edac: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> optee: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> random: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> virtio: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> xen: Remove PAGE_SIZE compile-time constant assumption >>>>>>>>> arm64: Fix macros to work in C code in addition to the linker script >>>>>>>>> arm64: Track early pgtable allocation limit >>>>>>>>> arm64: Introduce macros required for boot-time page selection >>>>>>>>> arm64: Refactor early pgtable size calculation macros >>>>>>>>> arm64: Pass desired page size on command line >>>>>>>>> arm64: Divorce early init from PAGE_SIZE >>>>>>>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >>>>>>>>> arm64: Align sections to PAGE_SIZE_MAX >>>>>>>>> arm64: Rework trampoline rodata mapping >>>>>>>>> arm64: Generalize fixmap for boot-time page size >>>>>>>>> arm64: Statically allocate and align for worst-case page size >>>>>>>>> arm64: Convert switch to if for non-const comparison values >>>>>>>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >>>>>>>>> 
arm64: Remove PAGE_SZ asm-offset >>>>>>>>> arm64: Introduce cpu features for page sizes >>>>>>>>> arm64: Remove PAGE_SIZE from assembly code >>>>>>>>> arm64: Runtime-fold pmd level >>>>>>>>> arm64: Support runtime folding in idmap_kpti_install_ng_mappings >>>>>>>>> arm64: TRAMP_VALIAS is no longer compile-time constant >>>>>>>>> arm64: Determine THREAD_SIZE at boot-time >>>>>>>>> arm64: Enable boot-time page size selection >>>>>>>>> >>>>>>>>> arch/alpha/include/asm/page.h | 1 + >>>>>>>>> arch/arc/include/asm/page.h | 1 + >>>>>>>>> arch/arm/include/asm/page.h | 1 + >>>>>>>>> arch/arm64/Kconfig | 26 ++- >>>>>>>>> arch/arm64/include/asm/assembler.h | 78 ++++++- >>>>>>>>> arch/arm64/include/asm/cpufeature.h | 44 +++- >>>>>>>>> arch/arm64/include/asm/efi.h | 2 +- >>>>>>>>> arch/arm64/include/asm/fixmap.h | 28 ++- >>>>>>>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >>>>>>>>> arch/arm64/include/asm/kvm_arm.h | 21 +- >>>>>>>>> arch/arm64/include/asm/kvm_hyp.h | 11 + >>>>>>>>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >>>>>>>>> arch/arm64/include/asm/memory.h | 62 ++++-- >>>>>>>>> arch/arm64/include/asm/page-def.h | 3 +- >>>>>>>>> arch/arm64/include/asm/pgalloc.h | 16 +- >>>>>>>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >>>>>>>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >>>>>>>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- >>>>>>>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >>>>>>>>> arch/arm64/include/asm/processor.h | 10 +- >>>>>>>>> arch/arm64/include/asm/sections.h | 1 + >>>>>>>>> arch/arm64/include/asm/smp.h | 1 + >>>>>>>>> arch/arm64/include/asm/sparsemem.h | 15 +- >>>>>>>>> arch/arm64/include/asm/sysreg.h | 54 +++-- >>>>>>>>> arch/arm64/include/asm/tlb.h | 3 + >>>>>>>>> arch/arm64/kernel/asm-offsets.c | 4 +- >>>>>>>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >>>>>>>>> arch/arm64/kernel/efi.c | 2 +- >>>>>>>>> arch/arm64/kernel/entry.S | 60 +++++- >>>>>>>>> arch/arm64/kernel/head.S | 46 +++- >>>>>>>>> 
arch/arm64/kernel/hibernate-asm.S | 6 +- >>>>>>>>> arch/arm64/kernel/image-vars.h | 14 ++ >>>>>>>>> arch/arm64/kernel/image.h | 4 + >>>>>>>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >>>>>>>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >>>>>>>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >>>>>>>>> arch/arm64/kernel/pi/pi.h | 63 +++++- >>>>>>>>> arch/arm64/kernel/relocate_kernel.S | 10 +- >>>>>>>>> arch/arm64/kernel/vdso-wrap.S | 4 +- >>>>>>>>> arch/arm64/kernel/vdso.c | 7 +- >>>>>>>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >>>>>>>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- >>>>>>>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >>>>>>>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >>>>>>>>> arch/arm64/kvm/arm.c | 10 + >>>>>>>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >>>>>>>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >>>>>>>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >>>>>>>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >>>>>>>>> arch/arm64/kvm/mmu.c | 39 ++-- >>>>>>>>> arch/arm64/lib/clear_page.S | 7 +- >>>>>>>>> arch/arm64/lib/copy_page.S | 33 ++- >>>>>>>>> arch/arm64/lib/mte.S | 27 ++- >>>>>>>>> arch/arm64/mm/Makefile | 1 + >>>>>>>>> arch/arm64/mm/fixmap.c | 38 ++-- >>>>>>>>> arch/arm64/mm/hugetlbpage.c | 40 +--- >>>>>>>>> arch/arm64/mm/init.c | 26 +-- >>>>>>>>> arch/arm64/mm/kasan_init.c | 8 +- >>>>>>>>> arch/arm64/mm/mmu.c | 53 +++-- >>>>>>>>> arch/arm64/mm/pgd.c | 12 +- >>>>>>>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ >>>>>>>>> arch/arm64/mm/proc.S | 128 ++++++++--- >>>>>>>>> arch/arm64/mm/ptdump.c | 3 +- >>>>>>>>> arch/arm64/tools/cpucaps | 3 + >>>>>>>>> arch/csky/include/asm/page.h | 3 + >>>>>>>>> arch/hexagon/include/asm/page.h | 2 + >>>>>>>>> arch/loongarch/include/asm/page.h | 2 + >>>>>>>>> arch/m68k/include/asm/page.h | 1 + >>>>>>>>> arch/microblaze/include/asm/page.h | 1 + >>>>>>>>> arch/mips/include/asm/page.h | 1 + >>>>>>>>> arch/nios2/include/asm/page.h | 2 + >>>>>>>>> arch/openrisc/include/asm/page.h | 1 + 
>>>>>>>>> arch/parisc/include/asm/page.h | 1 + >>>>>>>>> arch/powerpc/include/asm/page.h | 2 + >>>>>>>>> arch/riscv/include/asm/page.h | 1 + >>>>>>>>> arch/s390/include/asm/page.h | 1 + >>>>>>>>> arch/sh/include/asm/page.h | 1 + >>>>>>>>> arch/sparc/include/asm/page.h | 3 + >>>>>>>>> arch/um/include/asm/page.h | 2 + >>>>>>>>> arch/x86/include/asm/page_types.h | 2 + >>>>>>>>> arch/xtensa/include/asm/page.h | 1 + >>>>>>>>> crypto/lskcipher.c | 4 +- >>>>>>>>> drivers/ata/sata_sil24.c | 46 ++-- >>>>>>>>> drivers/base/node.c | 6 +- >>>>>>>>> drivers/base/topology.c | 32 +-- >>>>>>>>> drivers/block/virtio_blk.c | 2 +- >>>>>>>>> drivers/char/random.c | 4 +- >>>>>>>>> drivers/edac/edac_mc.h | 13 +- >>>>>>>>> drivers/firmware/efi/libstub/arm64.c | 3 +- >>>>>>>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- >>>>>>>>> drivers/mtd/mtdswap.c | 4 +- >>>>>>>>> drivers/net/ethernet/freescale/fec.h | 3 +- >>>>>>>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- >>>>>>>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >>>>>>>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >>>>>>>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >>>>>>>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >>>>>>>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >>>>>>>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- >>>>>>>>> drivers/net/ethernet/marvell/sky2.h | 2 +- >>>>>>>>> drivers/tee/optee/call.c | 7 +- >>>>>>>>> drivers/tee/optee/smc_abi.c | 2 +- >>>>>>>>> drivers/virtio/virtio_balloon.c | 10 +- >>>>>>>>> drivers/xen/balloon.c | 11 +- >>>>>>>>> drivers/xen/biomerge.c | 12 +- >>>>>>>>> drivers/xen/privcmd.c | 2 +- >>>>>>>>> drivers/xen/xenbus/xenbus_client.c | 5 +- >>>>>>>>> drivers/xen/xlate_mmu.c | 6 +- >>>>>>>>> fs/binfmt_elf.c | 11 +- >>>>>>>>> fs/buffer.c | 2 +- >>>>>>>>> fs/coredump.c | 8 +- >>>>>>>>> fs/ext4/ext4.h | 36 ++-- >>>>>>>>> fs/ext4/move_extent.c | 2 +- >>>>>>>>> fs/ext4/readpage.c | 2 +- >>>>>>>>> fs/fat/dir.c | 4 +- >>>>>>>>> fs/fat/fatent.c | 4 +- 
>>>>>>>>> fs/nfs/nfs42proc.c | 2 +- >>>>>>>>> fs/nfs/nfs42xattr.c | 2 +- >>>>>>>>> fs/nfs/nfs4proc.c | 2 +- >>>>>>>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ >>>>>>>>> include/asm-generic/vmlinux.lds.h | 38 ++-- >>>>>>>>> include/linux/buffer_head.h | 1 + >>>>>>>>> include/linux/cpumask.h | 5 + >>>>>>>>> include/linux/linkage.h | 4 +- >>>>>>>>> include/linux/mm.h | 17 +- >>>>>>>>> include/linux/mm_types.h | 15 +- >>>>>>>>> include/linux/mm_types_task.h | 2 +- >>>>>>>>> include/linux/mmzone.h | 3 +- >>>>>>>>> include/linux/netlink.h | 6 +- >>>>>>>>> include/linux/percpu-defs.h | 4 +- >>>>>>>>> include/linux/perf_event.h | 2 +- >>>>>>>>> include/linux/sched.h | 4 +- >>>>>>>>> include/linux/slab.h | 7 +- >>>>>>>>> include/linux/stackdepot.h | 6 +- >>>>>>>>> include/linux/sunrpc/svc.h | 8 +- >>>>>>>>> include/linux/sunrpc/svc_rdma.h | 4 +- >>>>>>>>> include/linux/sunrpc/svcsock.h | 2 +- >>>>>>>>> include/linux/swap.h | 17 +- >>>>>>>>> include/linux/swapops.h | 6 +- >>>>>>>>> include/linux/thread_info.h | 10 +- >>>>>>>>> include/xen/page.h | 2 + >>>>>>>>> init/main.c | 7 +- >>>>>>>>> kernel/bpf/core.c | 9 +- >>>>>>>>> kernel/bpf/ringbuf.c | 54 ++--- >>>>>>>>> kernel/cgroup/cgroup.c | 8 +- >>>>>>>>> kernel/crash_core.c | 2 +- >>>>>>>>> kernel/events/core.c | 2 +- >>>>>>>>> kernel/fork.c | 71 +++---- >>>>>>>>> kernel/power/power.h | 2 +- >>>>>>>>> kernel/power/snapshot.c | 2 +- >>>>>>>>> kernel/power/swap.c | 129 +++++++++-- >>>>>>>>> kernel/trace/fgraph.c | 2 +- >>>>>>>>> kernel/trace/trace.c | 2 +- >>>>>>>>> lib/stackdepot.c | 6 +- >>>>>>>>> mm/kasan/report.c | 3 +- >>>>>>>>> mm/memcontrol.c | 11 +- >>>>>>>>> mm/memory.c | 4 +- >>>>>>>>> mm/mmap.c | 2 +- >>>>>>>>> mm/page-writeback.c | 2 +- >>>>>>>>> mm/page_alloc.c | 31 +-- >>>>>>>>> mm/slub.c | 2 +- >>>>>>>>> mm/sparse.c | 2 +- >>>>>>>>> mm/swapfile.c | 2 +- >>>>>>>>> mm/vmalloc.c | 7 +- >>>>>>>>> net/9p/trans_virtio.c | 4 +- >>>>>>>>> net/core/hotdata.c | 4 +- >>>>>>>>> net/core/skbuff.c | 4 +- >>>>>>>>> 
net/core/sysctl_net_core.c | 2 +- >>>>>>>>> net/sunrpc/cache.c | 3 +- >>>>>>>>> net/unix/af_unix.c | 2 +- >>>>>>>>> sound/soc/soc-utils.c | 4 +- >>>>>>>>> virt/kvm/kvm_main.c | 2 +- >>>>>>>>> 172 files changed, 2185 insertions(+), 951 deletions(-) >>>>>>>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >>>>>>>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >>>>>>>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c >>>>>>>>> create mode 100644 include/asm-generic/pgtable-geometry.h >>>>>>>>> >>>>>>>>> -- >>>>>>>>> 2.43.0 >>>>>>>> >>>>>>>> This is a generally very exciting patch set! I'm looking forward to seeing it >>>>>>>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. >>>>>>>> >>>>>>>> That said, I have a couple of questions: >>>>>>>> >>>>>>>> * Going forward, how would we handle drivers/modules that require a particular >>>>>>>> page size? For example, the Apple Silicon IOMMU driver code requires the >>>>>>>> kernel to operate in 16k page size mode, and it would need to be disabled in >>>>>>>> other page sizes. >>>>>>> >>>>>>> I think these drivers would want to check PAGE_SIZE at probe time and fail if an >>>>>>> unsupported page size is in use. Do you see any issue with that? >>>>>>> >>>>>>>> >>>>>>>> * How would we handle an invalid selection at boot? >>>>>>> >>>>>>> What do you mean by invalid here? The current policy validates that the >>>>>>> requested page size is supported by the HW by checking mmfr0. If no page size is >>>>>>> passed on the command line, or the passed value is not supported by the HW, then >>>>>>> the we default to the largest page size supported by the HW (so for Apple >>>>>>> Silicon that would be 16k since the HW doesn't support 64k). Although I think it >>>>>>> may be better to change that policy to use the smallest page size in this case; >>>>>>> 4k is the safer bet for compat and will waste much less memory than 64k. 
>>>>>>> >>>>>>>> Can we program in a >>>>>>>> fallback when the "wrong" mode is selected for a chip or something similar? >>>>>>> >>>>>>> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple >>>>>>> Silicon? The trouble is that we need to select the page size, very early in >>>>>>> boot, before start_kernel() is called, so we really only have generic arch code >>>>>>> and the command line with which to make the decision. >>>>>> >>>>>> Yes... I think a build-time CONFIG for default page size, which can be >>>>>> overridden by a karg makes sense... Even on platforms like Apple >>>>>> Silicon you may want to test very specific things in 4k by overriding >>>>>> with a karg. >>>>> >>>>> Ahh, yes, that would certainly work. I'll work it into the next version. >>>>> >>>> >>>> Could we maybe extend to have some kind of way to include a table of >>>> SoC IDs that certain modes are disabled (e.g. 64k on Apple Silicon) >>> >>> 64k is already disabled on Apple Silicon because mmfr0 reports that 64k is not >>> supported. >>> >>>> and preferred modes when no arg is set (16k for Apple Silicon)? That >>> >>> And it's not obvious that we should hard-code a page size preference to a SoC >>> ID. If the CPU can support multiple page sizes, it should be up to the SW stack >>> to decide, not the SoC. >>> >>> I'm guessing your desire is to have a single kernel build that will boot 16k by >>> default on Apple Silicon and 4k by default on other systems, all without needing >>> to modify the command line? Personally I think it's cleaner to just require >>> setting the page size on the command line in these cases. >>> >>>> way it'd work something like this: >>>> >>>> 1. Table identification of 4/16/64 depending on identified SoC >>> So I'd prefer not to have this >>> >>>> 2. Unidentified ones follow build-time default >>>> 3. karg forces a mode regardless >>> But keep these 2.
>>> >> > Since we are talking about Apple Silicon and page size, I would like to > add that on the Apple Silicon SoCs I am working on, the situation is like > this: > > Apple A7 (s5l8960x), A8 (T7000), A8X (T7001): CPU MMU supports 4K and 64K > page sizes. > > Apple A9 (s8000/s8003), A9X (s8001), A10 (t8010), A10X (t8011), A11 (t8015): > CPU MMU supports 16K and 64K page sizes. > > However, all of them have 4K page DART IOMMUs. > >> I think it makes sense to have it, because it's not just Apple Silicon >> where such a preference/requirement may be necessary. Apple Silicon >> technically works at 4k, but is completely broken at 4k because Linux >> cannot do 16k IOMMU with 4k everything else, so being able to at least >> prefer 16k out of the box is important. And SoCs like the NVIDIA Grace >> Hopper platform prefer 64k over other options (though I am unaware of >> a gross incompatibility that effectively requires it like Apple >> Silicon has). >> >> When we're trying to get to "single generic image that works >> everywhere", stuff like this matters and I would really like you to >> consider it from the lens of "we want things to work as automagic as >> they do on x86". > For me, in order to get to this level of automagic, there does need to be > a table of which SoC should use which page size table. OK, but it's not clear to me that this table needs to be in the kernel. Could it not be something in user space (e.g. during installation) that configures the kernel command line? Regardless, the hard work here is getting the boot-time page size selection mechanism in place. Once that's there, follow-up patches can add the desired policy. I'd rather leave it out for now to avoid anything slowing down the core work. Thanks, Ryan > >> >> > > Nick Chan >
On Tue, Oct 22, 2024 at 11:12 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > > On 22/10/2024 16:03, Nick Chan wrote: > > > > > > Neal Gompa 於 2024/10/22 下午5:33 寫道: > >> On Mon, Oct 21, 2024 at 11:02 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > >>> > >>> On 21/10/2024 14:49, Neal Gompa wrote: > >>>> On Mon, Oct 21, 2024 at 7:51 AM Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>>> > >>>>> On 21/10/2024 12:32, Eric Curtin wrote: > >>>>>> On Mon, 21 Oct 2024 at 12:09, Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>>>>> > >>>>>>> On 19/10/2024 16:47, Neal Gompa wrote: > >>>>>>>> On Monday, October 14, 2024 6:55:11 AM EDT Ryan Roberts wrote: > >>>>>>>>> Hi All, > >>>>>>>>> > >>>>>>>>> Patch bomb incoming... This covers many subsystems, so I've included a core > >>>>>>>>> set of people on the full series and additionally included maintainers on > >>>>>>>>> relevant patches. I haven't included those maintainers on this cover letter > >>>>>>>>> since the numbers were far too big for it to work. But I've included a link > >>>>>>>>> to this cover letter on each patch, so they can hopefully find their way > >>>>>>>>> here. For follow up submissions I'll break it up by subsystem, but for now > >>>>>>>>> thought it was important to show the full picture. > >>>>>>>>> > >>>>>>>>> This RFC series implements support for boot-time page size selection within > >>>>>>>>> the arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to > >>>>>>>>> date, page size has been selected at compile-time, meaning the size is > >>>>>>>>> baked into a given kernel image. As use of larger-than-4K page sizes become > >>>>>>>>> more prevalent this starts to present a problem for distributions. > >>>>>>>>> Boot-time page size selection enables the creation of a single kernel > >>>>>>>>> image, which can be told which page size to use on the kernel command line. > >>>>>>>>> > >>>>>>>>> Why is having an image-per-page size problematic? 
> >>>>>>>>> ================================================= > >>>>>>>>> > >>>>>>>>> Many traditional distros are now supporting both 4K and 64K. And this means > >>>>>>>>> managing 2 kernel packages, along with drivers for each. For some, it means > >>>>>>>>> multiple installer flavours and multiple ISOs. All of this adds up to a > >>>>>>>>> less-than-ideal level of complexity. Additionally, Android now supports 4K > >>>>>>>>> and 16K kernels. I'm told having to explicitly manage their KABI for each > >>>>>>>>> kernel is painful, and the extra flash space required for both kernel > >>>>>>>>> images and the duplicated modules has been problematic. Boot-time page size > >>>>>>>>> selection solves all of this. > >>>>>>>>> > >>>>>>>>> Additionally, in starting to think about the longer term deployment story > >>>>>>>>> for D128 page tables, which Arm architecture now supports, a lot of the > >>>>>>>>> same problems need to be solved, so this work sets us up nicely for that. > >>>>>>>>> > >>>>>>>>> So what's the down side? > >>>>>>>>> ======================== > >>>>>>>>> > >>>>>>>>> Well nothing's free; Various static allocations in the kernel image must be > >>>>>>>>> sized for the worst case (largest supported page size), so image size is in > >>>>>>>>> line with size of 64K compile-time image. So if you're interested in 4K or > >>>>>>>>> 16K, there is a slight increase to the image size. But I expect that > >>>>>>>>> problem goes away if you're compressing the image - its just some extra > >>>>>>>>> zeros. At boot-time, I expect we could free the unused static storage once > >>>>>>>>> we know the page size - although that would be a follow up enhancement. > >>>>>>>>> > >>>>>>>>> And then there is performance. Since PAGE_SIZE and friends are no longer > >>>>>>>>> compile-time constants, we must look up their values and do arithmetic at > >>>>>>>>> runtime instead of compile-time. 
My early perf testing suggests this is > >>>>>>>>> imperceptible for real-world workloads, and only has small impact on > >>>>>>>>> microbenchmarks - more on this below. > >>>>>>>>> > >>>>>>>>> Approach > >>>>>>>>> ======== > >>>>>>>>> > >>>>>>>>> The basic idea is to rid the source of any assumptions that PAGE_SIZE and > >>>>>>>>> friends are compile-time constant, but in a way that allows the compiler to > >>>>>>>>> perform the same optimizations as was previously being done if they do turn > >>>>>>>>> out to be compile-time constant. Where constants are required, we use > >>>>>>>>> limits; PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full > >>>>>>>>> description of all the classes of problems to solve. > >>>>>>>>> > >>>>>>>>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > >>>>>>>>> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. > >>>>>>>>> arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > >>>>>>>>> Kconfig, which is an alternative to selecting a compile-time page size. > >>>>>>>>> > >>>>>>>>> When boot-time page size is active, the arch pgtable geometry macro > >>>>>>>>> definitions resolve to something that can be configured at boot. The arm64 > >>>>>>>>> implementation in this series mainly uses global, __ro_after_init > >>>>>>>>> variables. I've tried using alternatives patching, but that performs worse > >>>>>>>>> than loading from memory; I think due to code size bloat. > >>>>>>>>> > >>>>>>>>> Status > >>>>>>>>> ====== > >>>>>>>>> > >>>>>>>>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented > >>>>>>>>> enough to compile the kernel image itself with defconfig (and a few other > >>>>>>>>> bits and pieces). This is enough to build a kernel that can boot under QEMU > >>>>>>>>> or FVP. I'll happily do the rest of the work to enable all the extra > >>>>>>>>> drivers, but wanted to get feedback on the shape of this effort first.
If > >>>>>>>>> anyone wants to do any testing, and has a must-have config, let me know and > >>>>>>>>> I'll prioritize enabling it first. > >>>>>>>>> > >>>>>>>>> The series is arranged as follows: > >>>>>>>>> > >>>>>>>>> - patch 1: Add macros required for converting non-arch code to support > >>>>>>>>> boot-time page size selection > >>>>>>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from > >>>>>>>>> all non-arch code > >>>>>>>>> - patches 37-38: Some arm64 tidy ups > >>>>>>>>> - patch 39: Add macros required for converting arm64 code to support > >>>>>>>>> boot-time page size selection > >>>>>>>>> - patches 40-56: arm64 changes to support boot-time page size selection > >>>>>>>>> - patch 57: Add arm64 Kconfig option to enable boot-time page size > >>>>>>>>> selection > >>>>>>>>> > >>>>>>>>> Ideally, I'd like to get the basics merged (something like this series), > >>>>>>>>> then incrementally improve it over a handful of kernel releases until we > >>>>>>>>> can demonstrate that we have feature parity with the compile-time build and > >>>>>>>>> no performance blockers. Once at that point, ideally the compile-time build > >>>>>>>>> options would be removed and the code could be cleaned up further. > >>>>>>>>> > >>>>>>>>> One of the bigger pieces that I'd propose to add as a follow up, is to make > >>>>>>>>> va-size boot-time selectable too. That will greatly simplify LPA2 fallback > >>>>>>>>> handling. > >>>>>>>>> > >>>>>>>>> Assuming people are amenable to the rough shape, how would I go about > >>>>>>>>> getting the non-arch changes merged? Since they cover many subsystems, will > >>>>>>>>> each piece need to go independently to each relevant maintainer or could it > >>>>>>>>> all be merged together through the arm64 tree?
> >>>>>>>>> > >>>>>>>>> Image Size > >>>>>>>>> ========== > >>>>>>>>> > >>>>>>>>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > >>>>>>>>> kernel image on disk for base (before any changes applied), compile (with > >>>>>>>>> changes, configured for compile-time page size) and boot (with changes, > >>>>>>>>> configured for boot-time page size). > >>>>>>>>> > >>>>>>>>> You can see that the compile-16k and 64k configs are actually slightly > >>>>>>>>> smaller than the baselines; that's due to optimizing some buffer sizes > >>>>>>>>> which didn't need to depend on page size during the series. The boot-time > >>>>>>>>> image is ~1% bigger than the 64k compile-time image. I believe there is > >>>>>>>>> scope to improve this to make it > >>>>>>>>> equal to compile-64k if required: > >>>>>>>>> | config | size/KB | diff/KB | diff/% | > >>>>>>>>> | > >>>>>>>>> |-------------|---------|---------|---------| > >>>>>>>>> | > >>>>>>>>> | base-4k | 54895 | 0 | 0.0% | > >>>>>>>>> | base-16k | 55161 | 266 | 0.5% | > >>>>>>>>> | base-64k | 56775 | 1880 | 3.4% | > >>>>>>>>> | compile-4k | 54895 | 0 | 0.0% | > >>>>>>>>> | compile-16k | 55097 | 202 | 0.4% | > >>>>>>>>> | compile-64k | 56391 | 1496 | 2.7% | > >>>>>>>>> | boot-4K | 57045 | 2150 | 3.9% | > >>>>>>>>> > >>>>>>>>> And below shows the size of the image in memory at run-time, separated for > >>>>>>>>> text and data costs. The boot image has ~1% text cost; most likely due to > >>>>>>>>> the fact that PAGE_SIZE and friends are not compile-time constants so need > >>>>>>>>> instructions to load the values and do arithmetic.
I believe we could > >>>>>>>>> eventually get the data cost to match the cost for the compile image for > >>>>>>>>> the chosen page size by freeing > >>>>>>>>> the ends of the static buffers not needed for the selected page size: > >>>>>>>>> | | text | text | text | data | data | data | > >>>>>>>>> | > >>>>>>>>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > >>>>>>>>> | > >>>>>>>>> |-------------|---------|---------|---------|---------|---------|---------| > >>>>>>>>> | > >>>>>>>>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > >>>>>>>>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > >>>>>>>>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > >>>>>>>>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > >>>>>>>>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > >>>>>>>>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > >>>>>>>>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > >>>>>>>>> > >>>>>>>>> Functional Testing > >>>>>>>>> ================== > >>>>>>>>> > >>>>>>>>> I've build-tested defconfig for all arches supported by tuxmake (which is > >>>>>>>>> most) without issue. > >>>>>>>>> > >>>>>>>>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page > >>>>>>>>> sizes and a few va-sizes, and additionally have run all the mm-selftests, > >>>>>>>>> with no regressions observed vs the equivalent compile-time page size build > >>>>>>>>> (although the mm-selftests have a few existing failures when run against > >>>>>>>>> 16K and 64K kernels - those should really be investigated and fixed > >>>>>>>>> independently). > >>>>>>>>> > >>>>>>>>> Test coverage is lacking for many of the drivers that I've touched, but in > >>>>>>>>> many cases, I'm hoping the changes are simple enough that review might > >>>>>>>>> suffice? 
> >>>>>>>>> > >>>>>>>>> Performance Testing > >>>>>>>>> =================== > >>>>>>>>> > >>>>>>>>> I've run some limited performance benchmarks: > >>>>>>>>> > >>>>>>>>> First, a real-world benchmark that causes a lot of page table manipulation > >>>>>>>>> (and therefore we would expect to see regression here if we are going to > >>>>>>>>> see it anywhere); kernel compilation. It barely registers a change. Values > >>>>>>>>> are times, > >>>>>>>>> so smaller is better. All relative to base-4k: > >>>>>>>>> | | kern | kern | user | user | real | real | > >>>>>>>>> | > >>>>>>>>> | config | mean | stdev | mean | stdev | mean | stdev | > >>>>>>>>> | > >>>>>>>>> |-------------|---------|---------|---------|---------|---------|---------| > >>>>>>>>> | > >>>>>>>>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > >>>>>>>>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > >>>>>>>>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > >>>>>>>>> > >>>>>>>>> The Speedometer JavaScript benchmark also shows no change. Values are runs > >>>>>>>>> per > >>>>>>>>> min, so bigger is better. All relative to base-4k: > >>>>>>>>> | config | mean | stdev | > >>>>>>>>> | > >>>>>>>>> |-------------|---------|---------| > >>>>>>>>> | > >>>>>>>>> | base-4k | 0.0% | 0.8% | > >>>>>>>>> | compile-4k | 0.4% | 0.8% | > >>>>>>>>> | boot-4k | 0.0% | 0.9% | > >>>>>>>>> > >>>>>>>>> Finally, I've run some microbenchmarks known to stress page table > >>>>>>>>> manipulations (originally from David Hildenbrand). The fork test > >>>>>>>>> maps/allocs 1G of anon memory, then measures the cost of fork(). The munmap > >>>>>>>>> test maps/allocs 1G of anon memory then measures the cost of munmap()ing > >>>>>>>>> it. The fork test is known to be extremely sensitive to any changes that > >>>>>>>>> cause instructions to be aligned differently in cachelines. 
When using this > >>>>>>>>> test for other changes, I've seen double digit regressions for the > >>>>>>>>> slightest thing, so 12% regression on this test is actually fairly good. > >>>>>>>>> This likely represents the extreme worst case for regressions that will be > >>>>>>>>> observed across other microbenchmarks (famous last > >>>>>>>>> words). Values are times, so smaller is better. All relative to base-4k: > >>>>>>>>> | | fork | fork | munmap | munmap | > >>>>>>>>> | > >>>>>>>>> | config | mean | stdev | stdev | stdev | > >>>>>>>>> | > >>>>>>>>> |-------------|---------|---------|---------|---------| > >>>>>>>>> | > >>>>>>>>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > >>>>>>>>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > >>>>>>>>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > >>>>>>>>> > >>>>>>>>> NOTE: The series applies on top of v6.11. > >>>>>>>>> > >>>>>>>>> Thanks, > >>>>>>>>> Ryan > >>>>>>>>> > >>>>>>>>> > >>>>>>>>> Ryan Roberts (57): > >>>>>>>>> mm: Add macros ahead of supporting boot-time page size selection > >>>>>>>>> vmlinux: Align to PAGE_SIZE_MAX > >>>>>>>>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > >>>>>>>>> mm/page_alloc: Make page_frag_cache boot-time page size compatible > >>>>>>>>> mm: Avoid split pmd ptl if pmd level is run-time folded > >>>>>>>>> mm: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > >>>>>>>>> fs: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> fork: Permit boot-time THREAD_SIZE determination > >>>>>>>>> cgroup: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> bpf: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> stackdepot: Remove PAGE_SIZE compile-time constant assumption 
> >>>>>>>>> perf: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> kvm: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> trace: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> crash: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> crypto: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> sunrpc: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> sound: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: fec: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: marvell: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: hns3: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: e1000: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> net: igb: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> drivers/base: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> edac: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> optee: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> random: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> virtio: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> xen: Remove PAGE_SIZE compile-time constant assumption > >>>>>>>>> arm64: Fix macros to work in C code in addition to the linker script > >>>>>>>>> arm64: Track early pgtable allocation limit > >>>>>>>>> arm64: Introduce macros required for boot-time page selection > >>>>>>>>> arm64: Refactor early pgtable size calculation macros > >>>>>>>>> arm64: Pass desired page size on command line > >>>>>>>>> arm64: Divorce early init from PAGE_SIZE > >>>>>>>>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > >>>>>>>>> arm64: Align sections to PAGE_SIZE_MAX > 
>>>>>>>>> arm64: Rework trampoline rodata mapping > >>>>>>>>> arm64: Generalize fixmap for boot-time page size > >>>>>>>>> arm64: Statically allocate and align for worst-case page size > >>>>>>>>> arm64: Convert switch to if for non-const comparison values > >>>>>>>>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON > >>>>>>>>> arm64: Remove PAGE_SZ asm-offset > >>>>>>>>> arm64: Introduce cpu features for page sizes > >>>>>>>>> arm64: Remove PAGE_SIZE from assembly code > >>>>>>>>> arm64: Runtime-fold pmd level > >>>>>>>>> arm64: Support runtime folding in idmap_kpti_install_ng_mappings > >>>>>>>>> arm64: TRAMP_VALIAS is no longer compile-time constant > >>>>>>>>> arm64: Determine THREAD_SIZE at boot-time > >>>>>>>>> arm64: Enable boot-time page size selection > >>>>>>>>> > >>>>>>>>> arch/alpha/include/asm/page.h | 1 + > >>>>>>>>> arch/arc/include/asm/page.h | 1 + > >>>>>>>>> arch/arm/include/asm/page.h | 1 + > >>>>>>>>> arch/arm64/Kconfig | 26 ++- > >>>>>>>>> arch/arm64/include/asm/assembler.h | 78 ++++++- > >>>>>>>>> arch/arm64/include/asm/cpufeature.h | 44 +++- > >>>>>>>>> arch/arm64/include/asm/efi.h | 2 +- > >>>>>>>>> arch/arm64/include/asm/fixmap.h | 28 ++- > >>>>>>>>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > >>>>>>>>> arch/arm64/include/asm/kvm_arm.h | 21 +- > >>>>>>>>> arch/arm64/include/asm/kvm_hyp.h | 11 + > >>>>>>>>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- > >>>>>>>>> arch/arm64/include/asm/memory.h | 62 ++++-- > >>>>>>>>> arch/arm64/include/asm/page-def.h | 3 +- > >>>>>>>>> arch/arm64/include/asm/pgalloc.h | 16 +- > >>>>>>>>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > >>>>>>>>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > >>>>>>>>> arch/arm64/include/asm/pgtable-prot.h | 2 +- > >>>>>>>>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > >>>>>>>>> arch/arm64/include/asm/processor.h | 10 +- > >>>>>>>>> arch/arm64/include/asm/sections.h | 1 + > >>>>>>>>> arch/arm64/include/asm/smp.h | 1 + > >>>>>>>>> 
arch/arm64/include/asm/sparsemem.h | 15 +- > >>>>>>>>> arch/arm64/include/asm/sysreg.h | 54 +++-- > >>>>>>>>> arch/arm64/include/asm/tlb.h | 3 + > >>>>>>>>> arch/arm64/kernel/asm-offsets.c | 4 +- > >>>>>>>>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- > >>>>>>>>> arch/arm64/kernel/efi.c | 2 +- > >>>>>>>>> arch/arm64/kernel/entry.S | 60 +++++- > >>>>>>>>> arch/arm64/kernel/head.S | 46 +++- > >>>>>>>>> arch/arm64/kernel/hibernate-asm.S | 6 +- > >>>>>>>>> arch/arm64/kernel/image-vars.h | 14 ++ > >>>>>>>>> arch/arm64/kernel/image.h | 4 + > >>>>>>>>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > >>>>>>>>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > >>>>>>>>> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > >>>>>>>>> arch/arm64/kernel/pi/pi.h | 63 +++++- > >>>>>>>>> arch/arm64/kernel/relocate_kernel.S | 10 +- > >>>>>>>>> arch/arm64/kernel/vdso-wrap.S | 4 +- > >>>>>>>>> arch/arm64/kernel/vdso.c | 7 +- > >>>>>>>>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > >>>>>>>>> arch/arm64/kernel/vdso32-wrap.S | 4 +- > >>>>>>>>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > >>>>>>>>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > >>>>>>>>> arch/arm64/kvm/arm.c | 10 + > >>>>>>>>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > >>>>>>>>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > >>>>>>>>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > >>>>>>>>> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > >>>>>>>>> arch/arm64/kvm/mmu.c | 39 ++-- > >>>>>>>>> arch/arm64/lib/clear_page.S | 7 +- > >>>>>>>>> arch/arm64/lib/copy_page.S | 33 ++- > >>>>>>>>> arch/arm64/lib/mte.S | 27 ++- > >>>>>>>>> arch/arm64/mm/Makefile | 1 + > >>>>>>>>> arch/arm64/mm/fixmap.c | 38 ++-- > >>>>>>>>> arch/arm64/mm/hugetlbpage.c | 40 +--- > >>>>>>>>> arch/arm64/mm/init.c | 26 +-- > >>>>>>>>> arch/arm64/mm/kasan_init.c | 8 +- > >>>>>>>>> arch/arm64/mm/mmu.c | 53 +++-- > >>>>>>>>> arch/arm64/mm/pgd.c | 12 +- > >>>>>>>>> arch/arm64/mm/pgtable-geometry.c | 24 +++ > >>>>>>>>> arch/arm64/mm/proc.S | 128 
++++++++--- > >>>>>>>>> arch/arm64/mm/ptdump.c | 3 +- > >>>>>>>>> arch/arm64/tools/cpucaps | 3 + > >>>>>>>>> arch/csky/include/asm/page.h | 3 + > >>>>>>>>> arch/hexagon/include/asm/page.h | 2 + > >>>>>>>>> arch/loongarch/include/asm/page.h | 2 + > >>>>>>>>> arch/m68k/include/asm/page.h | 1 + > >>>>>>>>> arch/microblaze/include/asm/page.h | 1 + > >>>>>>>>> arch/mips/include/asm/page.h | 1 + > >>>>>>>>> arch/nios2/include/asm/page.h | 2 + > >>>>>>>>> arch/openrisc/include/asm/page.h | 1 + > >>>>>>>>> arch/parisc/include/asm/page.h | 1 + > >>>>>>>>> arch/powerpc/include/asm/page.h | 2 + > >>>>>>>>> arch/riscv/include/asm/page.h | 1 + > >>>>>>>>> arch/s390/include/asm/page.h | 1 + > >>>>>>>>> arch/sh/include/asm/page.h | 1 + > >>>>>>>>> arch/sparc/include/asm/page.h | 3 + > >>>>>>>>> arch/um/include/asm/page.h | 2 + > >>>>>>>>> arch/x86/include/asm/page_types.h | 2 + > >>>>>>>>> arch/xtensa/include/asm/page.h | 1 + > >>>>>>>>> crypto/lskcipher.c | 4 +- > >>>>>>>>> drivers/ata/sata_sil24.c | 46 ++-- > >>>>>>>>> drivers/base/node.c | 6 +- > >>>>>>>>> drivers/base/topology.c | 32 +-- > >>>>>>>>> drivers/block/virtio_blk.c | 2 +- > >>>>>>>>> drivers/char/random.c | 4 +- > >>>>>>>>> drivers/edac/edac_mc.h | 13 +- > >>>>>>>>> drivers/firmware/efi/libstub/arm64.c | 3 +- > >>>>>>>>> drivers/irqchip/irq-gic-v3-its.c | 2 +- > >>>>>>>>> drivers/mtd/mtdswap.c | 4 +- > >>>>>>>>> drivers/net/ethernet/freescale/fec.h | 3 +- > >>>>>>>>> drivers/net/ethernet/freescale/fec_main.c | 5 +- > >>>>>>>>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > >>>>>>>>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > >>>>>>>>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- > >>>>>>>>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > >>>>>>>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > >>>>>>>>> drivers/net/ethernet/marvell/mvneta.c | 9 +- > >>>>>>>>> drivers/net/ethernet/marvell/sky2.h | 2 +- > >>>>>>>>> drivers/tee/optee/call.c | 7 +- > >>>>>>>>> 
drivers/tee/optee/smc_abi.c | 2 +- > >>>>>>>>> drivers/virtio/virtio_balloon.c | 10 +- > >>>>>>>>> drivers/xen/balloon.c | 11 +- > >>>>>>>>> drivers/xen/biomerge.c | 12 +- > >>>>>>>>> drivers/xen/privcmd.c | 2 +- > >>>>>>>>> drivers/xen/xenbus/xenbus_client.c | 5 +- > >>>>>>>>> drivers/xen/xlate_mmu.c | 6 +- > >>>>>>>>> fs/binfmt_elf.c | 11 +- > >>>>>>>>> fs/buffer.c | 2 +- > >>>>>>>>> fs/coredump.c | 8 +- > >>>>>>>>> fs/ext4/ext4.h | 36 ++-- > >>>>>>>>> fs/ext4/move_extent.c | 2 +- > >>>>>>>>> fs/ext4/readpage.c | 2 +- > >>>>>>>>> fs/fat/dir.c | 4 +- > >>>>>>>>> fs/fat/fatent.c | 4 +- > >>>>>>>>> fs/nfs/nfs42proc.c | 2 +- > >>>>>>>>> fs/nfs/nfs42xattr.c | 2 +- > >>>>>>>>> fs/nfs/nfs4proc.c | 2 +- > >>>>>>>>> include/asm-generic/pgtable-geometry.h | 71 +++++++ > >>>>>>>>> include/asm-generic/vmlinux.lds.h | 38 ++-- > >>>>>>>>> include/linux/buffer_head.h | 1 + > >>>>>>>>> include/linux/cpumask.h | 5 + > >>>>>>>>> include/linux/linkage.h | 4 +- > >>>>>>>>> include/linux/mm.h | 17 +- > >>>>>>>>> include/linux/mm_types.h | 15 +- > >>>>>>>>> include/linux/mm_types_task.h | 2 +- > >>>>>>>>> include/linux/mmzone.h | 3 +- > >>>>>>>>> include/linux/netlink.h | 6 +- > >>>>>>>>> include/linux/percpu-defs.h | 4 +- > >>>>>>>>> include/linux/perf_event.h | 2 +- > >>>>>>>>> include/linux/sched.h | 4 +- > >>>>>>>>> include/linux/slab.h | 7 +- > >>>>>>>>> include/linux/stackdepot.h | 6 +- > >>>>>>>>> include/linux/sunrpc/svc.h | 8 +- > >>>>>>>>> include/linux/sunrpc/svc_rdma.h | 4 +- > >>>>>>>>> include/linux/sunrpc/svcsock.h | 2 +- > >>>>>>>>> include/linux/swap.h | 17 +- > >>>>>>>>> include/linux/swapops.h | 6 +- > >>>>>>>>> include/linux/thread_info.h | 10 +- > >>>>>>>>> include/xen/page.h | 2 + > >>>>>>>>> init/main.c | 7 +- > >>>>>>>>> kernel/bpf/core.c | 9 +- > >>>>>>>>> kernel/bpf/ringbuf.c | 54 ++--- > >>>>>>>>> kernel/cgroup/cgroup.c | 8 +- > >>>>>>>>> kernel/crash_core.c | 2 +- > >>>>>>>>> kernel/events/core.c | 2 +- > >>>>>>>>> kernel/fork.c | 71 +++---- > >>>>>>>>> 
kernel/power/power.h | 2 +- > >>>>>>>>> kernel/power/snapshot.c | 2 +- > >>>>>>>>> kernel/power/swap.c | 129 +++++++++-- > >>>>>>>>> kernel/trace/fgraph.c | 2 +- > >>>>>>>>> kernel/trace/trace.c | 2 +- > >>>>>>>>> lib/stackdepot.c | 6 +- > >>>>>>>>> mm/kasan/report.c | 3 +- > >>>>>>>>> mm/memcontrol.c | 11 +- > >>>>>>>>> mm/memory.c | 4 +- > >>>>>>>>> mm/mmap.c | 2 +- > >>>>>>>>> mm/page-writeback.c | 2 +- > >>>>>>>>> mm/page_alloc.c | 31 +-- > >>>>>>>>> mm/slub.c | 2 +- > >>>>>>>>> mm/sparse.c | 2 +- > >>>>>>>>> mm/swapfile.c | 2 +- > >>>>>>>>> mm/vmalloc.c | 7 +- > >>>>>>>>> net/9p/trans_virtio.c | 4 +- > >>>>>>>>> net/core/hotdata.c | 4 +- > >>>>>>>>> net/core/skbuff.c | 4 +- > >>>>>>>>> net/core/sysctl_net_core.c | 2 +- > >>>>>>>>> net/sunrpc/cache.c | 3 +- > >>>>>>>>> net/unix/af_unix.c | 2 +- > >>>>>>>>> sound/soc/soc-utils.c | 4 +- > >>>>>>>>> virt/kvm/kvm_main.c | 2 +- > >>>>>>>>> 172 files changed, 2185 insertions(+), 951 deletions(-) > >>>>>>>>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > >>>>>>>>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > >>>>>>>>> create mode 100644 arch/arm64/mm/pgtable-geometry.c > >>>>>>>>> create mode 100644 include/asm-generic/pgtable-geometry.h > >>>>>>>>> > >>>>>>>>> -- > >>>>>>>>> 2.43.0 > >>>>>>>> > >>>>>>>> This is a generally very exciting patch set! I'm looking forward to seeing it > >>>>>>>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. > >>>>>>>> > >>>>>>>> That said, I have a couple of questions: > >>>>>>>> > >>>>>>>> * Going forward, how would we handle drivers/modules that require a particular > >>>>>>>> page size? For example, the Apple Silicon IOMMU driver code requires the > >>>>>>>> kernel to operate in 16k page size mode, and it would need to be disabled in > >>>>>>>> other page sizes. > >>>>>>> > >>>>>>> I think these drivers would want to check PAGE_SIZE at probe time and fail if an > >>>>>>> unsupported page size is in use. 
Do you see any issue with that? > >>>>>>> > >>>>>>>> > >>>>>>>> * How would we handle an invalid selection at boot? > >>>>>>> > >>>>>>> What do you mean by invalid here? The current policy validates that the > >>>>>>> requested page size is supported by the HW by checking mmfr0. If no page size is > >>>>>>> passed on the command line, or the passed value is not supported by the HW, then > >>>>>>> we default to the largest page size supported by the HW (so for Apple > >>>>>>> Silicon that would be 16k since the HW doesn't support 64k). Although I think it > >>>>>>> may be better to change that policy to use the smallest page size in this case; > >>>>>>> 4k is the safer bet for compat and will waste much less memory than 64k. > >>>>>>> > >>>>>>>> Can we program in a > >>>>>>>> fallback when the "wrong" mode is selected for a chip or something similar? > >>>>>>> > >>>>>>> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple > >>>>>>> Silicon? The trouble is that we need to select the page size, very early in > >>>>>>> boot, before start_kernel() is called, so we really only have generic arch code > >>>>>>> and the command line with which to make the decision. > >>>>>> > >>>>>> Yes... I think a build-time CONFIG for default page size, which can be > >>>>>> overridden by a karg makes sense... Even on platforms like Apple > >>>>>> Silicon you may want to test very specific things in 4k by overriding > >>>>>> with a karg. > >>>>> > >>>>> Ahh, yes, that would certainly work. I'll work it into the next version. > >>>>> > >>>> > >>>> Could we maybe extend to have some kind of way to include a table of > >>>> SoC IDs that certain modes are disabled (e.g. 64k on Apple Silicon) > >>> > >>> 64k is already disabled on Apple Silicon because mmfr0 reports that 64k is not > >>> supported. > >>> > >>>> and preferred modes when no arg is set (16k for Apple Silicon)?
That > >>> > >>> And it's not obvious that we should hard-code a page size preference to a SoC > >>> ID. If the CPU can support multiple page sizes, it should be up to the SW stack > >>> to decide, not the SoC. > >>> > >>> I'm guessing your desire is to have a single kernel build that will boot 16k by > >>> default on Apple Silicon and 4k by default on other systems, all without needing > >>> to modify the command line? Personally I think it's cleaner to just require > >>> setting the page size on the command line in these cases. > >>> > >>>> way it'd work something like this: > >>>> > >>>> 1. Table identification of 4/16/64 depending on identified SoC > >>> So I'd prefer not to have this > >>> > >>>> 2. Unidentified ones follow build-time default > >>>> 3. karg forces a mode regardless > >>> But keep these 2. > >>> > >> > > Since we are talking about Apple Silicon and page size, I would like to > > add that on the Apple Silicon SoCs I am working on, the situation is like > > this: > > > > Apple A7 (s5l8960x), A8 (T7000), A8X (T7001): CPU MMU support 4K and 64K > > page sizes. > > > > Apple A9 (s8000/s8003), A9X (s8001), A10 (t8010), A10X (t8011), A11 (t8015): > > CPU MMU Support 16K and 64K page sizes. > > > > However, all of them have 4K page DART IOMMUs. > > > >> I think it makes sense to have it, because it's not just Apple Silicon > >> where such a preference/requirement may be necessary. Apple Silicon > >> technically works at 4k, but is completely broken at 4k because Linux > >> cannot do 16k IOMMU with 4k everything else, so being able to at least > >> prefer 16k out of the box is important. And SoCs like the NVIDIA Grace > >> Hopper platform prefer 64k over other options (though I am unaware of > >> a gross incompatibility that effectively requires it like Apple > >> Silicon has). 
> >> > >> When we're trying to get to "single generic image that works > >> everywhere", stuff like this matters and I would really like you to > >> consider it from the lens of "we want things to work as automagic as > >> they do on x86". > > For me, in order to get to this level of automagic, there do need to be > > a table of which SoC should use which page size table. > > OK, but it's not clear to me that this table needs to be in the kernel. Could it > not be something in user space (e.g. during installation) that configures the > kernel command line? > This is not compatible with using things like ISOs with UEFI+ACPI enabled desktop/server systems. We need to be able to safely, automatically, and correctly boot up and support hardware. The only place to do that early enough is in the kernel. But this can wait until the core stuff is in. > Regardless, the hard work here is getting the boot-time page size selection > mechanism in place. Once that's there, follow up patches can add the desired > policy. I'd rather leave it out for now to avoid anything slowing down the core > work. > Sure, this can be done afterward. -- 真実はいつも一つ!/ Always, there's only one truth!
On 22/10/2024 18:30, Neal Gompa wrote: [...] >>>>>>>>>> >>>>>>>>>> This is a generally very exciting patch set! I'm looking forward to seeing it >>>>>>>>>> land so I can take advantage of it for Fedora ARM and Fedora Asahi Remix. >>>>>>>>>> >>>>>>>>>> That said, I have a couple of questions: >>>>>>>>>> >>>>>>>>>> * Going forward, how would we handle drivers/modules that require a particular >>>>>>>>>> page size? For example, the Apple Silicon IOMMU driver code requires the >>>>>>>>>> kernel to operate in 16k page size mode, and it would need to be disabled in >>>>>>>>>> other page sizes. >>>>>>>>> >>>>>>>>> I think these drivers would want to check PAGE_SIZE at probe time and fail if an >>>>>>>>> unsupported page size is in use. Do you see any issue with that? >>>>>>>>> >>>>>>>>>> >>>>>>>>>> * How would we handle an invalid selection at boot? >>>>>>>>> >>>>>>>>> What do you mean by invalid here? The current policy validates that the >>>>>>>>> requested page size is supported by the HW by checking mmfr0. If no page size is >>>>>>>>> passed on the command line, or the passed value is not supported by the HW, then >>>>>>>>> we default to the largest page size supported by the HW (so for Apple >>>>>>>>> Silicon that would be 16k since the HW doesn't support 64k). Although I think it >>>>>>>>> may be better to change that policy to use the smallest page size in this case; >>>>>>>>> 4k is the safer bet for compat and will waste much less memory than 64k. >>>>>>>>> >>>>>>>>>> Can we program in a >>>>>>>>>> fallback when the "wrong" mode is selected for a chip or something similar? >>>>>>>>> >>>>>>>>> Do you mean effectively add a mechanism to force 16k if the detected HW is Apple >>>>>>>>> Silicon? The trouble is that we need to select the page size, very early in >>>>>>>>> boot, before start_kernel() is called, so we really only have generic arch code >>>>>>>>> and the command line with which to make the decision. >>>>>>>> >>>>>>>> Yes...
I think a build-time CONFIG for default page size, which can be >>>>>>>> overridden by a karg makes sense... Even on platforms like Apple >>>>>>>> Silicon you may want to test very specific things in 4k by overriding >>>>>>>> with a karg. >>>>>>> >>>>>>> Ahh, yes, that would certainly work. I'll work it into the next version. >>>>>>> >>>>>> >>>>>> Could we maybe extend to have some kind of way to include a table of >>>>>> SoC IDs that certain modes are disabled (e.g. 64k on Apple Silicon) >>>>> >>>>> 64k is already disabled on Apple Silicon because mmfr0 reports that 64k is not >>>>> supported. >>>>> >>>>>> and preferred modes when no arg is set (16k for Apple Silicon)? That >>>>> >>>>> And it's not obvious that we should hard-code a page size preference to a SoC >>>>> ID. If the CPU can support multiple page sizes, it should be up to the SW stack >>>>> to decide, not the SoC. >>>>> >>>>> I'm guessing your desire is to have a single kernel build that will boot 16k by >>>>> default on Apple Silicon and 4k by default on other systems, all without needing >>>>> to modify the command line? Personally I think it's cleaner to just require >>>>> setting the page size on the command line in these cases. >>>>> >>>>>> way it'd work something like this: >>>>>> >>>>>> 1. Table identification of 4/16/64 depending on identified SoC >>>>> So I'd prefer not to have this >>>>> >>>>>> 2. Unidentified ones follow build-time default >>>>>> 3. karg forces a mode regardless >>>>> But keep these 2. >>>>> >>>> >>> Since we are talking about Apple Silicon and page size, I would like to >>> add that on the Apple Silicon SoCs I am working on, the situation is like >>> this: >>> >>> Apple A7 (s5l8960x), A8 (T7000), A8X (T7001): CPU MMU support 4K and 64K >>> page sizes. >>> >>> Apple A9 (s8000/s8003), A9X (s8001), A10 (t8010), A10X (t8011), A11 (t8015): >>> CPU MMU Support 16K and 64K page sizes. >>> >>> However, all of them have 4K page DART IOMMUs. 
>>> >>>> I think it makes sense to have it, because it's not just Apple Silicon >>>> where such a preference/requirement may be necessary. Apple Silicon >>>> technically works at 4k, but is completely broken at 4k because Linux >>>> cannot do 16k IOMMU with 4k everything else, so being able to at least >>>> prefer 16k out of the box is important. And SoCs like the NVIDIA Grace >>>> Hopper platform prefer 64k over other options (though I am unaware of >>>> a gross incompatibility that effectively requires it like Apple >>>> Silicon has). >>>> >>>> When we're trying to get to "single generic image that works >>>> everywhere", stuff like this matters and I would really like you to >>>> consider it from the lens of "we want things to work as automagic as >>>> they do on x86". >>> For me, in order to get to this level of automagic, there do need to be >>> a table of which SoC should use which page size table. >> >> OK, but it's not clear to me that this table needs to be in the kernel. Could it >> not be something in user space (e.g. during installation) that configures the >> kernel command line? >> > > This is not compatible with using things like ISOs with UEFI+ACPI > enabled desktop/server systems. We need to be able to safely, > automatically, and correctly boot up and support hardware. The only > place to do that early enough is in the kernel. But this can wait > until the core stuff is in. OK got it. > >> Regardless, the hard work here is getting the boot-time page size selection >> mechanism in place. Once that's there, follow up patches can add the desired >> policy. I'd rather leave it out for now to avoid anything slowing down the core >> work. >> > > Sure, this can be done afterward. Thanks! I understand the problem a bit better now. I'm sure we can find a solution once we have landed the core mechanism. Thanks, Ryan
On 10/14/24 06:55, Ryan Roberts wrote: > Hi All, > > Patch bomb incoming... This covers many subsystems, so I've included a core set > of people on the full series and additionally included maintainers on relevant > patches. I haven't included those maintainers on this cover letter since the > numbers were far too big for it to work. But I've included a link to this cover > letter on each patch, so they can hopefully find their way here. For follow up > submissions I'll break it up by subsystem, but for now thought it was important > to show the full picture. > > This RFC series implements support for boot-time page size selection within the > arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page > size has been selected at compile-time, meaning the size is baked into a given > kernel image. As use of larger-than-4K page sizes becomes more prevalent this > starts to present a problem for distributions. Boot-time page size selection > enables the creation of a single kernel image, which can be told which page size > to use on the kernel command line. > > Why is having an image-per-page size problematic? > ================================================= > > Many traditional distros are now supporting both 4K and 64K. And this means > managing 2 kernel packages, along with drivers for each. For some, it means > multiple installer flavours and multiple ISOs. All of this adds up to a > less-than-ideal level of complexity. Additionally, Android now supports 4K and > 16K kernels. I'm told having to explicitly manage their KABI for each kernel is > painful, and the extra flash space required for both kernel images and the > duplicated modules has been problematic. Boot-time page size selection solves > all of this. > > Additionally, in starting to think about the longer term deployment story for > D128 page tables, which Arm architecture now supports, a lot of the same > problems need to be solved, so this work sets us up nicely for that.
> > So what's the down side? > ======================== > > Well nothing's free; Various static allocations in the kernel image must be > sized for the worst case (largest supported page size), so image size is in line > with size of 64K compile-time image. So if you're interested in 4K or 16K, there > is a slight increase to the image size. But I expect that problem goes away if > you're compressing the image - it's just some extra zeros. At boot-time, I expect > we could free the unused static storage once we know the page size - although > that would be a follow up enhancement. > > And then there is performance. Since PAGE_SIZE and friends are no longer > compile-time constants, we must look up their values and do arithmetic at > runtime instead of compile-time. My early perf testing suggests this is > imperceptible for real-world workloads, and only has small impact on > microbenchmarks - more on this below. > > Approach > ======== > > The basic idea is to rid the source of any assumptions that PAGE_SIZE and > friends are compile-time constant, but in a way that allows the compiler to > perform the same optimizations as was previously being done if they do turn out > to be compile-time constant. Where constants are required, we use limits; > PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description > of all the classes of problems to solve. > > By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 > does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, > which is an alternative to selecting a compile-time page size. > > When boot-time page size is active, the arch pgtable geometry macro definitions > resolve to something that can be configured at boot. The arm64 implementation in > this series mainly uses global, __ro_after_init variables.
I've tried using > alternatives patching, but that performs worse than loading from memory; I think > due to code size bloat. > > Status > ====== > > When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented enough > to compile the kernel image itself with defconfig (and a few other bits and > pieces). This is enough to build a kernel that can boot under QEMU or FVP. I'll > happily do the rest of the work to enable all the extra drivers, but wanted to > get feedback on the shape of this effort first. If anyone wants to do any > testing, and has a must-have config, let me know and I'll prioritize enabling it > first. > > The series is arranged as follows: > > - patch 1: Add macros required for converting non-arch code to support > boot-time page size selection > - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all > non-arch code > - patches 37-38: Some arm64 tidy ups > - patch 39: Add macros required for converting arm64 code to support > boot-time page size selection > - patches 40-56: arm64 changes to support boot-time page size selection > - patch 57: Add arm64 Kconfig option to enable boot-time page size > selection > > Ideally, I'd like to get the basics merged (something like this series), then > incrementally improve it over a handful of kernel releases until we can > demonstrate that we have feature parity with the compile-time build and no > performance blockers. Once at that point, ideally the compile-time build options > would be removed and the code could be cleaned up further. > > One of the bigger pieces that I'd propose to add as a follow up, is to make > va-size boot-time selectable too. That will greatly simplify LPA2 fallback > handling. > > Assuming people are amenable to the rough shape, how would I go about getting > the non-arch changes merged?
Since they cover many subsystems, will each piece > need to go independently to each relevant maintainer or could it all be merged > together through the arm64 tree? > > Image Size > ========== > > The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > kernel image on disk for base (before any changes applied), compile (with > changes, configured for compile-time page size) and boot (with changes, > configured for boot-time page size). > > You can see that the compile-16k and 64k configs are actually slightly smaller > than the baselines; that's due to optimizing some buffer sizes which didn't need > to depend on page size during the series. The boot-time image is ~1% bigger than > the 64k compile-time image. I believe there is scope to improve this to make it > equal to compile-64k if required: > > | config | size/KB | diff/KB | diff/% | > |-------------|---------|---------|---------| > | base-4k | 54895 | 0 | 0.0% | > | base-16k | 55161 | 266 | 0.5% | > | base-64k | 56775 | 1880 | 3.4% | > | compile-4k | 54895 | 0 | 0.0% | > | compile-16k | 55097 | 202 | 0.4% | > | compile-64k | 56391 | 1496 | 2.7% | > | boot-4K | 57045 | 2150 | 3.9% | > > And below shows the size of the image in memory at run-time, separated for text > and data costs. The boot image has ~1% text cost; most likely due to the fact > that PAGE_SIZE and friends are not compile-time constants so need instructions > to load the values and do arithmetic.
I believe we could eventually get the data > cost to match the cost for the compile image for the chosen page size by freeing > the ends of the static buffers not needed for the selected page size: > > | | text | text | text | data | data | data | > | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > > Functional Testing > ================== > > I've build-tested defconfig for all arches supported by tuxmake (which is most) > without issue. > > I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page sizes > and a few va-sizes, and additionally have run all the mm-selftests, with no > regressions observed vs the equivalent compile-time page size build (although > the mm-selftests have a few existing failures when run against 16K and 64K > kernels - those should really be investigated and fixed independently). > > Test coverage is lacking for many of the drivers that I've touched, but in many > cases, I'm hoping the changes are simple enough that review might suffice? > > Performance Testing > =================== > > I've run some limited performance benchmarks: > > First, a real-world benchmark that causes a lot of page table manipulation (and > therefore we would expect to see regression here if we are going to see it > anywhere); kernel compilation. It barely registers a change. Values are times, > so smaller is better. 
All relative to base-4k: > > | | kern | kern | user | user | real | real | > | config | mean | stdev | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > > The Speedometer JavaScript benchmark also shows no change. Values are runs per > min, so bigger is better. All relative to base-4k: > > | config | mean | stdev | > |-------------|---------|---------| > | base-4k | 0.0% | 0.8% | > | compile-4k | 0.4% | 0.8% | > | boot-4k | 0.0% | 0.9% | > > Finally, I've run some microbenchmarks known to stress page table manipulations > (originally from David Hildenbrand). The fork test maps/allocs 1G of anon > memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon > memory then measures the cost of munmap()ing it. The fork test is known to be > extremely sensitive to any changes that cause instructions to be aligned > differently in cachelines. When using this test for other changes, I've seen > double digit regressions for the slightest thing, so 12% regression on this test > is actually fairly good. This likely represents the extreme worst case for > regressions that will be observed across other microbenchmarks (famous last > words). Values are times, so smaller is better. All relative to base-4k: > > | | fork | fork | munmap | munmap | > | config | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > > NOTE: The series applies on top of v6.11.
> > Thanks, > Ryan > > > Ryan Roberts (57): > mm: Add macros ahead of supporting boot-time page size selection > vmlinux: Align to PAGE_SIZE_MAX > mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > mm/page_alloc: Make page_frag_cache boot-time page size compatible > mm: Avoid split pmd ptl if pmd level is run-time folded > mm: Remove PAGE_SIZE compile-time constant assumption > fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > fs: Remove PAGE_SIZE compile-time constant assumption > fs/nfs: Remove PAGE_SIZE compile-time constant assumption > fs/ext4: Remove PAGE_SIZE compile-time constant assumption > fork: Permit boot-time THREAD_SIZE determination > cgroup: Remove PAGE_SIZE compile-time constant assumption > bpf: Remove PAGE_SIZE compile-time constant assumption > pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > stackdepot: Remove PAGE_SIZE compile-time constant assumption > perf: Remove PAGE_SIZE compile-time constant assumption > kvm: Remove PAGE_SIZE compile-time constant assumption > trace: Remove PAGE_SIZE compile-time constant assumption > crash: Remove PAGE_SIZE compile-time constant assumption > crypto: Remove PAGE_SIZE compile-time constant assumption > sunrpc: Remove PAGE_SIZE compile-time constant assumption > sound: Remove PAGE_SIZE compile-time constant assumption > net: Remove PAGE_SIZE compile-time constant assumption > net: fec: Remove PAGE_SIZE compile-time constant assumption > net: marvell: Remove PAGE_SIZE compile-time constant assumption > net: hns3: Remove PAGE_SIZE compile-time constant assumption > net: e1000: Remove PAGE_SIZE compile-time constant assumption > net: igbvf: Remove PAGE_SIZE compile-time constant assumption > net: igb: Remove PAGE_SIZE compile-time constant assumption > drivers/base: Remove PAGE_SIZE compile-time constant assumption > edac: Remove PAGE_SIZE compile-time constant assumption > optee: Remove PAGE_SIZE compile-time constant assumption > random: Remove PAGE_SIZE 
compile-time constant assumption > sata_sil24: Remove PAGE_SIZE compile-time constant assumption > virtio: Remove PAGE_SIZE compile-time constant assumption > xen: Remove PAGE_SIZE compile-time constant assumption > arm64: Fix macros to work in C code in addition to the linker script > arm64: Track early pgtable allocation limit > arm64: Introduce macros required for boot-time page selection > arm64: Refactor early pgtable size calculation macros > arm64: Pass desired page size on command line > arm64: Divorce early init from PAGE_SIZE > arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > arm64: Align sections to PAGE_SIZE_MAX > arm64: Rework trampoline rodata mapping > arm64: Generalize fixmap for boot-time page size > arm64: Statically allocate and align for worst-case page size > arm64: Convert switch to if for non-const comparison values > arm64: Convert BUILD_BUG_ON to VM_BUG_ON > arm64: Remove PAGE_SZ asm-offset > arm64: Introduce cpu features for page sizes > arm64: Remove PAGE_SIZE from assembly code > arm64: Runtime-fold pmd level > arm64: Support runtime folding in idmap_kpti_install_ng_mappings > arm64: TRAMP_VALIAS is no longer compile-time constant > arm64: Determine THREAD_SIZE at boot-time > arm64: Enable boot-time page size selection > > arch/alpha/include/asm/page.h | 1 + > arch/arc/include/asm/page.h | 1 + > arch/arm/include/asm/page.h | 1 + > arch/arm64/Kconfig | 26 ++- > arch/arm64/include/asm/assembler.h | 78 ++++++- > arch/arm64/include/asm/cpufeature.h | 44 +++- > arch/arm64/include/asm/efi.h | 2 +- > arch/arm64/include/asm/fixmap.h | 28 ++- > arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > arch/arm64/include/asm/kvm_arm.h | 21 +- > arch/arm64/include/asm/kvm_hyp.h | 11 + > arch/arm64/include/asm/kvm_pgtable.h | 6 +- > arch/arm64/include/asm/memory.h | 62 ++++-- > arch/arm64/include/asm/page-def.h | 3 +- > arch/arm64/include/asm/pgalloc.h | 16 +- > arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > 
arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > arch/arm64/include/asm/pgtable-prot.h | 2 +- > arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > arch/arm64/include/asm/processor.h | 10 +- > arch/arm64/include/asm/sections.h | 1 + > arch/arm64/include/asm/smp.h | 1 + > arch/arm64/include/asm/sparsemem.h | 15 +- > arch/arm64/include/asm/sysreg.h | 54 +++-- > arch/arm64/include/asm/tlb.h | 3 + > arch/arm64/kernel/asm-offsets.c | 4 +- > arch/arm64/kernel/cpufeature.c | 93 ++++++-- > arch/arm64/kernel/efi.c | 2 +- > arch/arm64/kernel/entry.S | 60 +++++- > arch/arm64/kernel/head.S | 46 +++- > arch/arm64/kernel/hibernate-asm.S | 6 +- > arch/arm64/kernel/image-vars.h | 14 ++ > arch/arm64/kernel/image.h | 4 + > arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > arch/arm64/kernel/pi/pi.h | 63 +++++- > arch/arm64/kernel/relocate_kernel.S | 10 +- > arch/arm64/kernel/vdso-wrap.S | 4 +- > arch/arm64/kernel/vdso.c | 7 +- > arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > arch/arm64/kernel/vdso32-wrap.S | 4 +- > arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > arch/arm64/kvm/arm.c | 10 + > arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > arch/arm64/kvm/mmu.c | 39 ++-- > arch/arm64/lib/clear_page.S | 7 +- > arch/arm64/lib/copy_page.S | 33 ++- > arch/arm64/lib/mte.S | 27 ++- > arch/arm64/mm/Makefile | 1 + > arch/arm64/mm/fixmap.c | 38 ++-- > arch/arm64/mm/hugetlbpage.c | 40 +--- > arch/arm64/mm/init.c | 26 +-- > arch/arm64/mm/kasan_init.c | 8 +- > arch/arm64/mm/mmu.c | 53 +++-- > arch/arm64/mm/pgd.c | 12 +- > arch/arm64/mm/pgtable-geometry.c | 24 +++ > arch/arm64/mm/proc.S | 128 ++++++++--- > arch/arm64/mm/ptdump.c | 3 +- > arch/arm64/tools/cpucaps | 3 + > arch/csky/include/asm/page.h | 3 + > 
arch/hexagon/include/asm/page.h | 2 + > arch/loongarch/include/asm/page.h | 2 + > arch/m68k/include/asm/page.h | 1 + > arch/microblaze/include/asm/page.h | 1 + > arch/mips/include/asm/page.h | 1 + > arch/nios2/include/asm/page.h | 2 + > arch/openrisc/include/asm/page.h | 1 + > arch/parisc/include/asm/page.h | 1 + > arch/powerpc/include/asm/page.h | 2 + > arch/riscv/include/asm/page.h | 1 + > arch/s390/include/asm/page.h | 1 + > arch/sh/include/asm/page.h | 1 + > arch/sparc/include/asm/page.h | 3 + > arch/um/include/asm/page.h | 2 + > arch/x86/include/asm/page_types.h | 2 + > arch/xtensa/include/asm/page.h | 1 + > crypto/lskcipher.c | 4 +- > drivers/ata/sata_sil24.c | 46 ++-- > drivers/base/node.c | 6 +- > drivers/base/topology.c | 32 +-- > drivers/block/virtio_blk.c | 2 +- > drivers/char/random.c | 4 +- > drivers/edac/edac_mc.h | 13 +- > drivers/firmware/efi/libstub/arm64.c | 3 +- > drivers/irqchip/irq-gic-v3-its.c | 2 +- > drivers/mtd/mtdswap.c | 4 +- > drivers/net/ethernet/freescale/fec.h | 3 +- > drivers/net/ethernet/freescale/fec_main.c | 5 +- > .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > drivers/net/ethernet/intel/igb/igb.h | 25 +-- > drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > drivers/net/ethernet/marvell/mvneta.c | 9 +- > drivers/net/ethernet/marvell/sky2.h | 2 +- > drivers/tee/optee/call.c | 7 +- > drivers/tee/optee/smc_abi.c | 2 +- > drivers/virtio/virtio_balloon.c | 10 +- > drivers/xen/balloon.c | 11 +- > drivers/xen/biomerge.c | 12 +- > drivers/xen/privcmd.c | 2 +- > drivers/xen/xenbus/xenbus_client.c | 5 +- > drivers/xen/xlate_mmu.c | 6 +- > fs/binfmt_elf.c | 11 +- > fs/buffer.c | 2 +- > fs/coredump.c | 8 +- > fs/ext4/ext4.h | 36 ++-- > fs/ext4/move_extent.c | 2 +- > fs/ext4/readpage.c | 2 +- > fs/fat/dir.c | 4 +- > fs/fat/fatent.c | 4 +- > fs/nfs/nfs42proc.c | 2 +- > fs/nfs/nfs42xattr.c | 2 +- > fs/nfs/nfs4proc.c | 2 +- 
> include/asm-generic/pgtable-geometry.h | 71 +++++++ > include/asm-generic/vmlinux.lds.h | 38 ++-- > include/linux/buffer_head.h | 1 + > include/linux/cpumask.h | 5 + > include/linux/linkage.h | 4 +- > include/linux/mm.h | 17 +- > include/linux/mm_types.h | 15 +- > include/linux/mm_types_task.h | 2 +- > include/linux/mmzone.h | 3 +- > include/linux/netlink.h | 6 +- > include/linux/percpu-defs.h | 4 +- > include/linux/perf_event.h | 2 +- > include/linux/sched.h | 4 +- > include/linux/slab.h | 7 +- > include/linux/stackdepot.h | 6 +- > include/linux/sunrpc/svc.h | 8 +- > include/linux/sunrpc/svc_rdma.h | 4 +- > include/linux/sunrpc/svcsock.h | 2 +- > include/linux/swap.h | 17 +- > include/linux/swapops.h | 6 +- > include/linux/thread_info.h | 10 +- > include/xen/page.h | 2 + > init/main.c | 7 +- > kernel/bpf/core.c | 9 +- > kernel/bpf/ringbuf.c | 54 ++--- > kernel/cgroup/cgroup.c | 8 +- > kernel/crash_core.c | 2 +- > kernel/events/core.c | 2 +- > kernel/fork.c | 71 +++---- > kernel/power/power.h | 2 +- > kernel/power/snapshot.c | 2 +- > kernel/power/swap.c | 129 +++++++++-- > kernel/trace/fgraph.c | 2 +- > kernel/trace/trace.c | 2 +- > lib/stackdepot.c | 6 +- > mm/kasan/report.c | 3 +- > mm/memcontrol.c | 11 +- > mm/memory.c | 4 +- > mm/mmap.c | 2 +- > mm/page-writeback.c | 2 +- > mm/page_alloc.c | 31 +-- > mm/slub.c | 2 +- > mm/sparse.c | 2 +- > mm/swapfile.c | 2 +- > mm/vmalloc.c | 7 +- > net/9p/trans_virtio.c | 4 +- > net/core/hotdata.c | 4 +- > net/core/skbuff.c | 4 +- > net/core/sysctl_net_core.c | 2 +- > net/sunrpc/cache.c | 3 +- > net/unix/af_unix.c | 2 +- > sound/soc/soc-utils.c | 4 +- > virt/kvm/kvm_main.c | 2 +- > 172 files changed, 2185 insertions(+), 951 deletions(-) > create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > create mode 100644 arch/arm64/mm/pgtable-geometry.c > create mode 100644 include/asm-generic/pgtable-geometry.h > > -- > 2.43.0 > > Hi Ryan, First off, this is 
excellent work! Your cover page was very detailed and made the patch set easier to understand. Some questions/comments: Once a kernel is booted with a certain page size, could there be issues if it is booted later with a different page size? How about if this is done frequently? A random example of this: Let's say a retailer, doctor's office or a similar OLTP environment prefers a small page size during the day for performance reasons. Then, in the off-hours, it prefers a large page size for DSS-type workloads like running reports or batch jobs. I'm thinking about how this might be used for cost savings. The best approach would be to have multiple systems/VMs/cloud instances for the different workload types. However, an end user might only have one system type and change the page size regularly, as in that example. Also, the performance impact does look very minimal. It will be interesting to see if there are any effects on the larger industry-standard benchmarks like TPC and SPEC. Thanks, Joe
On 18.10.24 20:15, Joseph Salisbury wrote: > > > > On 10/14/24 06:55, Ryan Roberts wrote: >> Hi All, >> >> Patch bomb incoming... This covers many subsystems, so I've included a core set >> of people on the full series and additionally included maintainers on relevant >> patches. I haven't included those maintainers on this cover letter since the >> numbers were far too big for it to work. But I've included a link to this cover >> letter on each patch, so they can hopefully find their way here. For follow up >> submissions I'll break it up by subsystem, but for now thought it was important >> to show the full picture. >> >> This RFC series implements support for boot-time page size selection within the >> arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page >> size has been selected at compile-time, meaning the size is baked into a given >> kernel image. As use of larger-than-4K page sizes become more prevalent this >> starts to present a problem for distributions. Boot-time page size selection >> enables the creation of a single kernel image, which can be told which page size >> to use on the kernel command line. >> >> Why is having an image-per-page size problematic? >> ================================================= >> >> Many traditional distros are now supporting both 4K and 64K. And this means >> managing 2 kernel packages, along with drivers for each. For some, it means >> multiple installer flavours and multiple ISOs. All of this adds up to a >> less-than-ideal level of complexity. Additionally, Android now supports 4K and >> 16K kernels. I'm told having to explicitly manage their KABI for each kernel is >> painful, and the extra flash space required for both kernel images and the >> duplicated modules has been problematic. Boot-time page size selection solves >> all of this. 
>> >> Additionally, in starting to think about the longer term deployment story for >> D128 page tables, which Arm architecture now supports, a lot of the same >> problems need to be solved, so this work sets us up nicely for that. >> >> So what's the down side? >> ======================== >> >> Well, nothing's free; various static allocations in the kernel image must be >> sized for the worst case (largest supported page size), so image size is in line >> with size of 64K compile-time image. So if you're interested in 4K or 16K, there >> is a slight increase to the image size. But I expect that problem goes away if >> you're compressing the image - it's just some extra zeros. At boot-time, I expect >> we could free the unused static storage once we know the page size - although >> that would be a follow up enhancement. >> >> And then there is performance. Since PAGE_SIZE and friends are no longer >> compile-time constants, we must look up their values and do arithmetic at >> runtime instead of compile-time. My early perf testing suggests this is >> imperceptible for real-world workloads, and only has small impact on >> microbenchmarks - more on this below. >> >> Approach >> ======== >> >> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >> friends are compile-time constant, but in a way that allows the compiler to >> perform the same optimizations as were previously being done if they do turn out >> to be compile-time constant. Where constants are required, we use limits; >> PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description >> of all the classes of problems to solve. >> >> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 >> does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, >> which is an alternative to selecting a compile-time page size. 
>> >> When boot-time page size is active, the arch pgtable geometry macro definitions >> resolve to something that can be configured at boot. The arm64 implementation in >> this series mainly uses global, __ro_after_init variables. I've tried using >> alternatives patching, but that performs worse than loading from memory; I think >> due to code size bloat. >> >> Status >> ====== >> >> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented enough >> to compile the kernel image itself with defconfig (and a few other bits and >> pieces). This is enough to build a kernel that can boot under QEMU or FVP. I'll >> happily do the rest of the work to enable all the extra drivers, but wanted to >> get feedback on the shape of this effort first. If anyone wants to do any >> testing, and has a must-have config, let me know and I'll prioritize enabling it >> first. >> >> The series is arranged as follows: >> >> - patch 1: Add macros required for converting non-arch code to support >> boot-time page size selection >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >> non-arch code >> - patches 37-38: Some arm64 tidy ups >> - patch 39: Add macros required for converting arm64 code to support >> boot-time page size selection >> - patches 40-56: arm64 changes to support boot-time page size selection >> - patch 57: Add arm64 Kconfig option to enable boot-time page size >> selection >> >> Ideally, I'd like to get the basics merged (something like this series), then >> incrementally improve it over a handful of kernel releases until we can >> demonstrate that we have feature parity with the compile-time build and no >> performance blockers. Once at that point, ideally the compile-time build options >> would be removed and the code could be cleaned up further. >> >> One of the bigger pieces that I'd propose to add as a follow up is to make >> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >> handling. 
>> >> Assuming people are amenable to the rough shape, how would I go about getting >> the non-arch changes merged? Since they cover many subsystems, will each piece >> need to go independently to each relevant maintainer or could it all be merged >> together through the arm64 tree? >> >> Image Size >> ========== >> >> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >> kernel image on disk for base (before any changes applied), compile (with >> changes, configured for compile-time page size) and boot (with changes, >> configured for boot-time page size). >> >> You can see that the compile-16k and 64k configs are actually slightly smaller >> than the baselines; that's due to optimizing some buffer sizes which didn't need >> to depend on page size during the series. The boot-time image is ~1% bigger than >> the 64k compile-time image. I believe there is scope to improve this to make it >> equal to compile-64k if required: >> >> | config | size/KB | diff/KB | diff/% | >> |-------------|---------|---------|---------| >> | base-4k | 54895 | 0 | 0.0% | >> | base-16k | 55161 | 266 | 0.5% | >> | base-64k | 56775 | 1880 | 3.4% | >> | compile-4k | 54895 | 0 | 0.0% | >> | compile-16k | 55097 | 202 | 0.4% | >> | compile-64k | 56391 | 1496 | 2.7% | >> | boot-4K | 57045 | 2150 | 3.9% | >> >> And below shows the size of the image in memory at run-time, separated for text >> and data costs. The boot image has ~1% text cost; most likely due to the fact >> that PAGE_SIZE and friends are not compile-time constants so need instructions >> to load the values and do arithmetic. 
I believe we could eventually get the data >> cost to match the cost for the compile image for the chosen page size by freeing >> the ends of the static buffers not needed for the selected page size: >> >> | | text | text | text | data | data | data | >> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >> |-------------|---------|---------|---------|---------|---------|---------| >> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >> >> Functional Testing >> ================== >> >> I've build-tested defconfig for all arches supported by tuxmake (which is most) >> without issue. >> >> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page sizes >> and a few va-sizes, and additionally have run all the mm-selftests, with no >> regressions observed vs the equivalent compile-time page size build (although >> the mm-selftests have a few existing failures when run against 16K and 64K >> kernels - those should really be investigated and fixed independently). >> >> Test coverage is lacking for many of the drivers that I've touched, but in many >> cases, I'm hoping the changes are simple enough that review might suffice? >> >> Performance Testing >> =================== >> >> I've run some limited performance benchmarks: >> >> First, a real-world benchmark that causes a lot of page table manipulation (and >> therefore we would expect to see regression here if we are going to see it >> anywhere); kernel compilation. It barely registers a change. Values are times, >> so smaller is better. 
All relative to base-4k: >> >> | | kern | kern | user | user | real | real | >> | config | mean | stdev | mean | stdev | mean | stdev | >> |-------------|---------|---------|---------|---------|---------|---------| >> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >> >> The Speedometer JavaScript benchmark also shows no change. Values are runs per >> min, so bigger is better. All relative to base-4k: >> >> | config | mean | stdev | >> |-------------|---------|---------| >> | base-4k | 0.0% | 0.8% | >> | compile-4k | 0.4% | 0.8% | >> | boot-4k | 0.0% | 0.9% | >> >> Finally, I've run some microbenchmarks known to stress page table manipulations >> (originally from David Hildenbrand). The fork test maps/allocs 1G of anon >> memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon >> memory, then measures the cost of munmap()ing it. The fork test is known to be >> extremely sensitive to any changes that cause instructions to be aligned >> differently in cachelines. When using this test for other changes, I've seen >> double-digit regressions for the slightest thing, so a 12% regression on this test >> is actually fairly good. This likely represents the extreme worst case for >> regressions that will be observed across other microbenchmarks (famous last >> words). Values are times, so smaller is better. All relative to base-4k: >> >> | | fork | fork | munmap | munmap | >> | config | mean | stdev | mean | stdev | >> |-------------|---------|---------|---------|---------| >> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >> >> NOTE: The series applies on top of v6.11. 
>> >> Thanks, >> Ryan >> >> >> Ryan Roberts (57): >> mm: Add macros ahead of supporting boot-time page size selection >> vmlinux: Align to PAGE_SIZE_MAX >> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >> mm/page_alloc: Make page_frag_cache boot-time page size compatible >> mm: Avoid split pmd ptl if pmd level is run-time folded >> mm: Remove PAGE_SIZE compile-time constant assumption >> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >> fs: Remove PAGE_SIZE compile-time constant assumption >> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >> fork: Permit boot-time THREAD_SIZE determination >> cgroup: Remove PAGE_SIZE compile-time constant assumption >> bpf: Remove PAGE_SIZE compile-time constant assumption >> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >> stackdepot: Remove PAGE_SIZE compile-time constant assumption >> perf: Remove PAGE_SIZE compile-time constant assumption >> kvm: Remove PAGE_SIZE compile-time constant assumption >> trace: Remove PAGE_SIZE compile-time constant assumption >> crash: Remove PAGE_SIZE compile-time constant assumption >> crypto: Remove PAGE_SIZE compile-time constant assumption >> sunrpc: Remove PAGE_SIZE compile-time constant assumption >> sound: Remove PAGE_SIZE compile-time constant assumption >> net: Remove PAGE_SIZE compile-time constant assumption >> net: fec: Remove PAGE_SIZE compile-time constant assumption >> net: marvell: Remove PAGE_SIZE compile-time constant assumption >> net: hns3: Remove PAGE_SIZE compile-time constant assumption >> net: e1000: Remove PAGE_SIZE compile-time constant assumption >> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >> net: igb: Remove PAGE_SIZE compile-time constant assumption >> drivers/base: Remove PAGE_SIZE compile-time constant assumption >> edac: Remove PAGE_SIZE compile-time constant assumption >> optee: Remove PAGE_SIZE compile-time constant 
assumption >> random: Remove PAGE_SIZE compile-time constant assumption >> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >> virtio: Remove PAGE_SIZE compile-time constant assumption >> xen: Remove PAGE_SIZE compile-time constant assumption >> arm64: Fix macros to work in C code in addition to the linker script >> arm64: Track early pgtable allocation limit >> arm64: Introduce macros required for boot-time page selection >> arm64: Refactor early pgtable size calculation macros >> arm64: Pass desired page size on command line >> arm64: Divorce early init from PAGE_SIZE >> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >> arm64: Align sections to PAGE_SIZE_MAX >> arm64: Rework trampoline rodata mapping >> arm64: Generalize fixmap for boot-time page size >> arm64: Statically allocate and align for worst-case page size >> arm64: Convert switch to if for non-const comparison values >> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >> arm64: Remove PAGE_SZ asm-offset >> arm64: Introduce cpu features for page sizes >> arm64: Remove PAGE_SIZE from assembly code >> arm64: Runtime-fold pmd level >> arm64: Support runtime folding in idmap_kpti_install_ng_mappings >> arm64: TRAMP_VALIAS is no longer compile-time constant >> arm64: Determine THREAD_SIZE at boot-time >> arm64: Enable boot-time page size selection >> >> arch/alpha/include/asm/page.h | 1 + >> arch/arc/include/asm/page.h | 1 + >> arch/arm/include/asm/page.h | 1 + >> arch/arm64/Kconfig | 26 ++- >> arch/arm64/include/asm/assembler.h | 78 ++++++- >> arch/arm64/include/asm/cpufeature.h | 44 +++- >> arch/arm64/include/asm/efi.h | 2 +- >> arch/arm64/include/asm/fixmap.h | 28 ++- >> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >> arch/arm64/include/asm/kvm_arm.h | 21 +- >> arch/arm64/include/asm/kvm_hyp.h | 11 + >> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >> arch/arm64/include/asm/memory.h | 62 ++++-- >> arch/arm64/include/asm/page-def.h | 3 +- >> arch/arm64/include/asm/pgalloc.h | 16 +- 
>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >> arch/arm64/include/asm/pgtable-prot.h | 2 +- >> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >> arch/arm64/include/asm/processor.h | 10 +- >> arch/arm64/include/asm/sections.h | 1 + >> arch/arm64/include/asm/smp.h | 1 + >> arch/arm64/include/asm/sparsemem.h | 15 +- >> arch/arm64/include/asm/sysreg.h | 54 +++-- >> arch/arm64/include/asm/tlb.h | 3 + >> arch/arm64/kernel/asm-offsets.c | 4 +- >> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >> arch/arm64/kernel/efi.c | 2 +- >> arch/arm64/kernel/entry.S | 60 +++++- >> arch/arm64/kernel/head.S | 46 +++- >> arch/arm64/kernel/hibernate-asm.S | 6 +- >> arch/arm64/kernel/image-vars.h | 14 ++ >> arch/arm64/kernel/image.h | 4 + >> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >> arch/arm64/kernel/pi/pi.h | 63 +++++- >> arch/arm64/kernel/relocate_kernel.S | 10 +- >> arch/arm64/kernel/vdso-wrap.S | 4 +- >> arch/arm64/kernel/vdso.c | 7 +- >> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >> arch/arm64/kernel/vdso32-wrap.S | 4 +- >> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >> arch/arm64/kvm/arm.c | 10 + >> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >> arch/arm64/kvm/mmu.c | 39 ++-- >> arch/arm64/lib/clear_page.S | 7 +- >> arch/arm64/lib/copy_page.S | 33 ++- >> arch/arm64/lib/mte.S | 27 ++- >> arch/arm64/mm/Makefile | 1 + >> arch/arm64/mm/fixmap.c | 38 ++-- >> arch/arm64/mm/hugetlbpage.c | 40 +--- >> arch/arm64/mm/init.c | 26 +-- >> arch/arm64/mm/kasan_init.c | 8 +- >> arch/arm64/mm/mmu.c | 53 +++-- >> arch/arm64/mm/pgd.c | 12 +- >> arch/arm64/mm/pgtable-geometry.c | 24 +++ >> arch/arm64/mm/proc.S | 128 ++++++++--- >> 
arch/arm64/mm/ptdump.c | 3 +- >> arch/arm64/tools/cpucaps | 3 + >> arch/csky/include/asm/page.h | 3 + >> arch/hexagon/include/asm/page.h | 2 + >> arch/loongarch/include/asm/page.h | 2 + >> arch/m68k/include/asm/page.h | 1 + >> arch/microblaze/include/asm/page.h | 1 + >> arch/mips/include/asm/page.h | 1 + >> arch/nios2/include/asm/page.h | 2 + >> arch/openrisc/include/asm/page.h | 1 + >> arch/parisc/include/asm/page.h | 1 + >> arch/powerpc/include/asm/page.h | 2 + >> arch/riscv/include/asm/page.h | 1 + >> arch/s390/include/asm/page.h | 1 + >> arch/sh/include/asm/page.h | 1 + >> arch/sparc/include/asm/page.h | 3 + >> arch/um/include/asm/page.h | 2 + >> arch/x86/include/asm/page_types.h | 2 + >> arch/xtensa/include/asm/page.h | 1 + >> crypto/lskcipher.c | 4 +- >> drivers/ata/sata_sil24.c | 46 ++-- >> drivers/base/node.c | 6 +- >> drivers/base/topology.c | 32 +-- >> drivers/block/virtio_blk.c | 2 +- >> drivers/char/random.c | 4 +- >> drivers/edac/edac_mc.h | 13 +- >> drivers/firmware/efi/libstub/arm64.c | 3 +- >> drivers/irqchip/irq-gic-v3-its.c | 2 +- >> drivers/mtd/mtdswap.c | 4 +- >> drivers/net/ethernet/freescale/fec.h | 3 +- >> drivers/net/ethernet/freescale/fec_main.c | 5 +- >> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >> drivers/net/ethernet/marvell/mvneta.c | 9 +- >> drivers/net/ethernet/marvell/sky2.h | 2 +- >> drivers/tee/optee/call.c | 7 +- >> drivers/tee/optee/smc_abi.c | 2 +- >> drivers/virtio/virtio_balloon.c | 10 +- >> drivers/xen/balloon.c | 11 +- >> drivers/xen/biomerge.c | 12 +- >> drivers/xen/privcmd.c | 2 +- >> drivers/xen/xenbus/xenbus_client.c | 5 +- >> drivers/xen/xlate_mmu.c | 6 +- >> fs/binfmt_elf.c | 11 +- >> fs/buffer.c | 2 +- >> fs/coredump.c | 8 +- >> fs/ext4/ext4.h | 36 ++-- >> fs/ext4/move_extent.c | 2 +- >> 
fs/ext4/readpage.c | 2 +- >> fs/fat/dir.c | 4 +- >> fs/fat/fatent.c | 4 +- >> fs/nfs/nfs42proc.c | 2 +- >> fs/nfs/nfs42xattr.c | 2 +- >> fs/nfs/nfs4proc.c | 2 +- >> include/asm-generic/pgtable-geometry.h | 71 +++++++ >> include/asm-generic/vmlinux.lds.h | 38 ++-- >> include/linux/buffer_head.h | 1 + >> include/linux/cpumask.h | 5 + >> include/linux/linkage.h | 4 +- >> include/linux/mm.h | 17 +- >> include/linux/mm_types.h | 15 +- >> include/linux/mm_types_task.h | 2 +- >> include/linux/mmzone.h | 3 +- >> include/linux/netlink.h | 6 +- >> include/linux/percpu-defs.h | 4 +- >> include/linux/perf_event.h | 2 +- >> include/linux/sched.h | 4 +- >> include/linux/slab.h | 7 +- >> include/linux/stackdepot.h | 6 +- >> include/linux/sunrpc/svc.h | 8 +- >> include/linux/sunrpc/svc_rdma.h | 4 +- >> include/linux/sunrpc/svcsock.h | 2 +- >> include/linux/swap.h | 17 +- >> include/linux/swapops.h | 6 +- >> include/linux/thread_info.h | 10 +- >> include/xen/page.h | 2 + >> init/main.c | 7 +- >> kernel/bpf/core.c | 9 +- >> kernel/bpf/ringbuf.c | 54 ++--- >> kernel/cgroup/cgroup.c | 8 +- >> kernel/crash_core.c | 2 +- >> kernel/events/core.c | 2 +- >> kernel/fork.c | 71 +++---- >> kernel/power/power.h | 2 +- >> kernel/power/snapshot.c | 2 +- >> kernel/power/swap.c | 129 +++++++++-- >> kernel/trace/fgraph.c | 2 +- >> kernel/trace/trace.c | 2 +- >> lib/stackdepot.c | 6 +- >> mm/kasan/report.c | 3 +- >> mm/memcontrol.c | 11 +- >> mm/memory.c | 4 +- >> mm/mmap.c | 2 +- >> mm/page-writeback.c | 2 +- >> mm/page_alloc.c | 31 +-- >> mm/slub.c | 2 +- >> mm/sparse.c | 2 +- >> mm/swapfile.c | 2 +- >> mm/vmalloc.c | 7 +- >> net/9p/trans_virtio.c | 4 +- >> net/core/hotdata.c | 4 +- >> net/core/skbuff.c | 4 +- >> net/core/sysctl_net_core.c | 2 +- >> net/sunrpc/cache.c | 3 +- >> net/unix/af_unix.c | 2 +- >> sound/soc/soc-utils.c | 4 +- >> virt/kvm/kvm_main.c | 2 +- >> 172 files changed, 2185 insertions(+), 951 deletions(-) >> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >> create 
mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >> create mode 100644 arch/arm64/mm/pgtable-geometry.c >> create mode 100644 include/asm-generic/pgtable-geometry.h >> >> -- >> 2.43.0 >> >> > > Hi Ryan, > > First off, this is excellent work! Your cover page was very detailed > and made the patch set easier to understand. > > Some questions/comments: > > Once a kernel is booted with a certain page size, could there be issues > if it is booted later with a different page size? How about if this is > done frequently? I think that is the reason why you are only given the option in RHEL to select the kernel (4K vs. 64K) to use at install time. Software can easily use a different data format for persistence based on the base page size. I would suspect DBs might be the usual suspects. One example is swap space, I think, where the base page size in use when the device was formatted is what counts, and the device cannot be used with a different page size unless it is reformatted. So ... one has to be a bit careful ... -- Cheers, David / dhildenb
On 10/18/24 14:27, David Hildenbrand wrote: > On 18.10.24 20:15, Joseph Salisbury wrote: >> >> >> >> On 10/14/24 06:55, Ryan Roberts wrote: >>> Hi All, >>> >>> Patch bomb incoming... This covers many subsystems, so I've included >>> a core set >>> of people on the full series and additionally included maintainers >>> on relevant >>> patches. I haven't included those maintainers on this cover letter >>> since the >>> numbers were far too big for it to work. But I've included a link to >>> this cover >>> letter on each patch, so they can hopefully find their way here. For >>> follow up >>> submissions I'll break it up by subsystem, but for now thought it >>> was important >>> to show the full picture. >>> >>> This RFC series implements support for boot-time page size selection >>> within the >>> arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but >>> to date, page >>> size has been selected at compile-time, meaning the size is baked >>> into a given >>> kernel image. As use of larger-than-4K page sizes become more >>> prevalent this >>> starts to present a problem for distributions. Boot-time page size >>> selection >>> enables the creation of a single kernel image, which can be told >>> which page size >>> to use on the kernel command line. >>> >>> Why is having an image-per-page size problematic? >>> ================================================= >>> >>> Many traditional distros are now supporting both 4K and 64K. And >>> this means >>> managing 2 kernel packages, along with drivers for each. For some, >>> it means >>> multiple installer flavours and multiple ISOs. All of this adds up to a >>> less-than-ideal level of complexity. Additionally, Android now >>> supports 4K and >>> 16K kernels. I'm told having to explicitly manage their KABI for >>> each kernel is >>> painful, and the extra flash space required for both kernel images >>> and the >>> duplicated modules has been problematic. Boot-time page size >>> selection solves >>> all of this. 
>>> >>> Additionally, in starting to think about the longer term deployment >>> story for >>> D128 page tables, which Arm architecture now supports, a lot of the >>> same >>> problems need to be solved, so this work sets us up nicely for that. >>> >>> So what's the down side? >>> ======================== >>> >>> Well nothing's free; Various static allocations in the kernel image >>> must be >>> sized for the worst case (largest supported page size), so image >>> size is in line >>> with size of 64K compile-time image. So if you're interested in 4K >>> or 16K, there >>> is a slight increase to the image size. But I expect that problem >>> goes away if >>> you're compressing the image - its just some extra zeros. At >>> boot-time, I expect >>> we could free the unused static storage once we know the page size - >>> although >>> that would be a follow up enhancement. >>> >>> And then there is performance. Since PAGE_SIZE and friends are no >>> longer >>> compile-time constants, we must look up their values and do >>> arithmetic at >>> runtime instead of compile-time. My early perf testing suggests this is >>> inperceptible for real-world workloads, and only has small impact on >>> microbenchmarks - more on this below. >>> >>> Approach >>> ======== >>> >>> The basic idea is to rid the source of any assumptions that >>> PAGE_SIZE and >>> friends are compile-time constant, but in a way that allows the >>> compiler to >>> perform the same optimizations as was previously being done if they >>> do turn out >>> to be compile-time constant. Where constants are required, we use >>> limits; >>> PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full >>> description >>> of all the classes of problems to solve. >>> >>> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may >>> opt-in to >>> boot-time page size selection by defining PAGE_SIZE_MIN & >>> PAGE_SIZE_MAX. 
arm64 >>> does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >>> Kconfig, >>> which is an alternative to selecting a compile-time page size. >>> >>> When boot-time page size is active, the arch pgtable geometry macro >>> definitions >>> resolve to something that can be configured at boot. The arm64 >>> implementation in >>> this series mainly uses global, __ro_after_init variables. I've >>> tried using >>> alternatives patching, but that performs worse than loading from >>> memory; I think >>> due to code size bloat. >>> >>> Status >>> ====== >>> >>> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only >>> implemented enough >>> to compile the kernel image itself with defconfig (and a few other >>> bits and >>> pieces). This is enough to build a kernel that can boot under QEMU >>> or FVP. I'll >>> happily do the rest of the work to enable all the extra drivers, but >>> wanted to >>> get feedback on the shape of this effort first. If anyone wants to >>> do any >>> testing, and has a must-have config, let me know and I'll prioritize >>> enabling it >>> first. >>> >>> The series is arranged as follows: >>> >>> - patch 1: Add macros required for converting non-arch >>> code to support >>> boot-time page size selection >>> - patches 2-36: Remove PAGE_SIZE compile-time constant >>> assumption from all >>> non-arch code >>> - patches 37-38: Some arm64 tidy ups >>> - patch 39: Add macros required for converting arm64 code >>> to support >>> boot-time page size selection >>> - patches 40-56: arm64 changes to support boot-time page size >>> selection >>> - patch 57: Add arm64 Kconfig option to enable boot-time >>> page size >>> selection >>> >>> Ideally, I'd like to get the basics merged (something like this >>> series), then >>> incrementally improve it over a handful of kernel releases until we can >>> demonstrate that we have feature parity with the compile-time build >>> and no >>> performance blockers. 
Once at that point, ideally the compile-time >>> build options >>> would be removed and the code could be cleaned up further. >>> >>> One of the bigger peices that I'd propose to add as a follow up, is >>> to make >>> va-size boot-time selectable too. That will greatly simplify LPA2 >>> fallback >>> handling. >>> >>> Assuming people are ammenable to the rough shape, how would I go >>> about getting >>> the non-arch changes merged? Since they cover many subsystems, will >>> each piece >>> need to go independently to each relevant maintainer or could it all >>> be merged >>> together through the arm64 tree? >>> >>> Image Size >>> ========== >>> >>> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, >>> kprobes) >>> kernel image on disk for base (before any changes applied), compile >>> (with >>> changes, configured for compile-time page size) and boot (with changes, >>> configured for boot-time page size). >>> >>> You can see the that compile-16k and 64k configs are actually >>> slightly smaller >>> than the baselines; that's due to optimizing some buffer sizes which >>> didn't need >>> to depend on page size during the series. The boot-time image is ~1% >>> bigger than >>> the 64k compile-time image. I believe there is scope to improve this >>> to make it >>> equal to compile-64k if required: >>> >>> | config | size/KB | diff/KB | diff/% | >>> |-------------|---------|---------|---------| >>> | base-4k | 54895 | 0 | 0.0% | >>> | base-16k | 55161 | 266 | 0.5% | >>> | base-64k | 56775 | 1880 | 3.4% | >>> | compile-4k | 54895 | 0 | 0.0% | >>> | compile-16k | 55097 | 202 | 0.4% | >>> | compile-64k | 56391 | 1496 | 2.7% | >>> | boot-4K | 57045 | 2150 | 3.9% | >>> >>> And below shows the size of the image in memory at run-time, >>> separated for text >>> and data costs. 
The boot image has ~1% text cost; most likely due to >>> the fact >>> that PAGE_SIZE and friends are not compile-time constants so need >>> instructions >>> to load the values and do arithmetic. I believe we could eventually >>> get the data >>> cost to match the cost for the compile image for the chosen page >>> size by freeing >>> the ends of the static buffers not needed for the selected page size: >>> >>> | | text | text | text | data | data | >>> data | >>> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | >>> diff/% | >>> |-------------|---------|---------|---------|---------|---------|---------| >>> >>> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >>> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >>> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | >>> 9.5% | >>> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >>> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >>> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >>> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >>> >>> Functional Testing >>> ================== >>> >>> I've build-tested defconfig for all arches supported by tuxmake >>> (which is most) >>> without issue. >>> >>> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all >>> page sizes >>> and a few va-sizes, and additionally have run all the mm-selftests, >>> with no >>> regressions observed vs the equivalent compile-time page size build >>> (although >>> the mm-selftests have a few existing failures when run against 16K >>> and 64K >>> kernels - those should really be investigated and fixed independently). >>> >>> Test coverage is lacking for many of the drivers that I've touched, >>> but in many >>> cases, I'm hoping the changes are simple enough that review might >>> suffice? 
>>> >>> Performance Testing >>> =================== >>> >>> I've run some limited performance benchmarks: >>> >>> First, a real-world benchmark that causes a lot of page table >>> manipulation (and >>> therefore we would expect to see regression here if we are going to >>> see it >>> anywhere); kernel compilation. It barely registers a change. Values >>> are times, >>> so smaller is better. All relative to base-4k: >>> >>> | | kern | kern | user | user | real | >>> real | >>> | config | mean | stdev | mean | stdev | mean | >>> stdev | >>> |-------------|---------|---------|---------|---------|---------|---------| >>> >>> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | >>> 0.3% | >>> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | >>> 0.3% | >>> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | >>> 0.2% | >>> >>> The Speedometer JavaScript benchmark also shows no change. Values >>> are runs per >>> min, so bigger is better. All relative to base-4k: >>> >>> | config | mean | stdev | >>> |-------------|---------|---------| >>> | base-4k | 0.0% | 0.8% | >>> | compile-4k | 0.4% | 0.8% | >>> | boot-4k | 0.0% | 0.9% | >>> >>> Finally, I've run some microbenchmarks known to stress page table >>> manipulations >>> (originally from David Hildenbrand). The fork test maps/allocs 1G of >>> anon >>> memory, then measures the cost of fork(). The munmap test >>> maps/allocs 1G of anon >>> memory then measures the cost of munmap()ing it. The fork test is >>> known to be >>> extremely sensitive to any changes that cause instructions to be >>> aligned >>> differently in cachelines. When using this test for other changes, >>> I've seen >>> double digit regressions for the slightest thing, so 12% regression >>> on this test >>> is actually fairly good. This likely represents the extreme worst >>> case for >>> regressions that will be observed across other microbenchmarks >>> (famous last >>> words). Values are times, so smaller is better. 
All relative to >>> base-4k: >>> >>> | | fork | fork | munmap | munmap | >>> | config | mean | stdev | stdev | stdev | >>> |-------------|---------|---------|---------|---------| >>> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >>> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >>> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >>> >>> NOTE: The series applies on top of v6.11. >>> >>> Thanks, >>> Ryan >>> >>> >>> Ryan Roberts (57): >>> mm: Add macros ahead of supporting boot-time page size selection >>> vmlinux: Align to PAGE_SIZE_MAX >>> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is >>> large >>> mm/page_alloc: Make page_frag_cache boot-time page size compatible >>> mm: Avoid split pmd ptl if pmd level is run-time folded >>> mm: Remove PAGE_SIZE compile-time constant assumption >>> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >>> fs: Remove PAGE_SIZE compile-time constant assumption >>> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >>> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >>> fork: Permit boot-time THREAD_SIZE determination >>> cgroup: Remove PAGE_SIZE compile-time constant assumption >>> bpf: Remove PAGE_SIZE compile-time constant assumption >>> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >>> stackdepot: Remove PAGE_SIZE compile-time constant assumption >>> perf: Remove PAGE_SIZE compile-time constant assumption >>> kvm: Remove PAGE_SIZE compile-time constant assumption >>> trace: Remove PAGE_SIZE compile-time constant assumption >>> crash: Remove PAGE_SIZE compile-time constant assumption >>> crypto: Remove PAGE_SIZE compile-time constant assumption >>> sunrpc: Remove PAGE_SIZE compile-time constant assumption >>> sound: Remove PAGE_SIZE compile-time constant assumption >>> net: Remove PAGE_SIZE compile-time constant assumption >>> net: fec: Remove PAGE_SIZE compile-time constant assumption >>> net: marvell: Remove PAGE_SIZE compile-time constant assumption >>> net: hns3: Remove PAGE_SIZE 
compile-time constant assumption >>> net: e1000: Remove PAGE_SIZE compile-time constant assumption >>> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >>> net: igb: Remove PAGE_SIZE compile-time constant assumption >>> drivers/base: Remove PAGE_SIZE compile-time constant assumption >>> edac: Remove PAGE_SIZE compile-time constant assumption >>> optee: Remove PAGE_SIZE compile-time constant assumption >>> random: Remove PAGE_SIZE compile-time constant assumption >>> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >>> virtio: Remove PAGE_SIZE compile-time constant assumption >>> xen: Remove PAGE_SIZE compile-time constant assumption >>> arm64: Fix macros to work in C code in addition to the linker >>> script >>> arm64: Track early pgtable allocation limit >>> arm64: Introduce macros required for boot-time page selection >>> arm64: Refactor early pgtable size calculation macros >>> arm64: Pass desired page size on command line >>> arm64: Divorce early init from PAGE_SIZE >>> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >>> arm64: Align sections to PAGE_SIZE_MAX >>> arm64: Rework trampoline rodata mapping >>> arm64: Generalize fixmap for boot-time page size >>> arm64: Statically allocate and align for worst-case page size >>> arm64: Convert switch to if for non-const comparison values >>> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >>> arm64: Remove PAGE_SZ asm-offset >>> arm64: Introduce cpu features for page sizes >>> arm64: Remove PAGE_SIZE from assembly code >>> arm64: Runtime-fold pmd level >>> arm64: Support runtime folding in idmap_kpti_install_ng_mappings >>> arm64: TRAMP_VALIAS is no longer compile-time constant >>> arm64: Determine THREAD_SIZE at boot-time >>> arm64: Enable boot-time page size selection >>> >>> arch/alpha/include/asm/page.h | 1 + >>> arch/arc/include/asm/page.h | 1 + >>> arch/arm/include/asm/page.h | 1 + >>> arch/arm64/Kconfig | 26 ++- >>> arch/arm64/include/asm/assembler.h | 78 ++++++- >>> 
arch/arm64/include/asm/cpufeature.h | 44 +++- >>> arch/arm64/include/asm/efi.h | 2 +- >>> arch/arm64/include/asm/fixmap.h | 28 ++- >>> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >>> arch/arm64/include/asm/kvm_arm.h | 21 +- >>> arch/arm64/include/asm/kvm_hyp.h | 11 + >>> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >>> arch/arm64/include/asm/memory.h | 62 ++++-- >>> arch/arm64/include/asm/page-def.h | 3 +- >>> arch/arm64/include/asm/pgalloc.h | 16 +- >>> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >>> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >>> arch/arm64/include/asm/pgtable-prot.h | 2 +- >>> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >>> arch/arm64/include/asm/processor.h | 10 +- >>> arch/arm64/include/asm/sections.h | 1 + >>> arch/arm64/include/asm/smp.h | 1 + >>> arch/arm64/include/asm/sparsemem.h | 15 +- >>> arch/arm64/include/asm/sysreg.h | 54 +++-- >>> arch/arm64/include/asm/tlb.h | 3 + >>> arch/arm64/kernel/asm-offsets.c | 4 +- >>> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >>> arch/arm64/kernel/efi.c | 2 +- >>> arch/arm64/kernel/entry.S | 60 +++++- >>> arch/arm64/kernel/head.S | 46 +++- >>> arch/arm64/kernel/hibernate-asm.S | 6 +- >>> arch/arm64/kernel/image-vars.h | 14 ++ >>> arch/arm64/kernel/image.h | 4 + >>> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >>> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >>> arch/arm64/kernel/pi/map_range.c | 201 >>> ++++++++++++++++-- >>> arch/arm64/kernel/pi/pi.h | 63 +++++- >>> arch/arm64/kernel/relocate_kernel.S | 10 +- >>> arch/arm64/kernel/vdso-wrap.S | 4 +- >>> arch/arm64/kernel/vdso.c | 7 +- >>> arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >>> arch/arm64/kernel/vdso32-wrap.S | 4 +- >>> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >>> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >>> arch/arm64/kvm/arm.c | 10 + >>> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >>> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >>> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >>> 
arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >>> arch/arm64/kvm/mmu.c | 39 ++-- >>> arch/arm64/lib/clear_page.S | 7 +- >>> arch/arm64/lib/copy_page.S | 33 ++- >>> arch/arm64/lib/mte.S | 27 ++- >>> arch/arm64/mm/Makefile | 1 + >>> arch/arm64/mm/fixmap.c | 38 ++-- >>> arch/arm64/mm/hugetlbpage.c | 40 +--- >>> arch/arm64/mm/init.c | 26 +-- >>> arch/arm64/mm/kasan_init.c | 8 +- >>> arch/arm64/mm/mmu.c | 53 +++-- >>> arch/arm64/mm/pgd.c | 12 +- >>> arch/arm64/mm/pgtable-geometry.c | 24 +++ >>> arch/arm64/mm/proc.S | 128 ++++++++--- >>> arch/arm64/mm/ptdump.c | 3 +- >>> arch/arm64/tools/cpucaps | 3 + >>> arch/csky/include/asm/page.h | 3 + >>> arch/hexagon/include/asm/page.h | 2 + >>> arch/loongarch/include/asm/page.h | 2 + >>> arch/m68k/include/asm/page.h | 1 + >>> arch/microblaze/include/asm/page.h | 1 + >>> arch/mips/include/asm/page.h | 1 + >>> arch/nios2/include/asm/page.h | 2 + >>> arch/openrisc/include/asm/page.h | 1 + >>> arch/parisc/include/asm/page.h | 1 + >>> arch/powerpc/include/asm/page.h | 2 + >>> arch/riscv/include/asm/page.h | 1 + >>> arch/s390/include/asm/page.h | 1 + >>> arch/sh/include/asm/page.h | 1 + >>> arch/sparc/include/asm/page.h | 3 + >>> arch/um/include/asm/page.h | 2 + >>> arch/x86/include/asm/page_types.h | 2 + >>> arch/xtensa/include/asm/page.h | 1 + >>> crypto/lskcipher.c | 4 +- >>> drivers/ata/sata_sil24.c | 46 ++-- >>> drivers/base/node.c | 6 +- >>> drivers/base/topology.c | 32 +-- >>> drivers/block/virtio_blk.c | 2 +- >>> drivers/char/random.c | 4 +- >>> drivers/edac/edac_mc.h | 13 +- >>> drivers/firmware/efi/libstub/arm64.c | 3 +- >>> drivers/irqchip/irq-gic-v3-its.c | 2 +- >>> drivers/mtd/mtdswap.c | 4 +- >>> drivers/net/ethernet/freescale/fec.h | 3 +- >>> drivers/net/ethernet/freescale/fec_main.c | 5 +- >>> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >>> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >>> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >>> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ 
>>> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >>> drivers/net/ethernet/marvell/mvneta.c | 9 +- >>> drivers/net/ethernet/marvell/sky2.h | 2 +- >>> drivers/tee/optee/call.c | 7 +- >>> drivers/tee/optee/smc_abi.c | 2 +- >>> drivers/virtio/virtio_balloon.c | 10 +- >>> drivers/xen/balloon.c | 11 +- >>> drivers/xen/biomerge.c | 12 +- >>> drivers/xen/privcmd.c | 2 +- >>> drivers/xen/xenbus/xenbus_client.c | 5 +- >>> drivers/xen/xlate_mmu.c | 6 +- >>> fs/binfmt_elf.c | 11 +- >>> fs/buffer.c | 2 +- >>> fs/coredump.c | 8 +- >>> fs/ext4/ext4.h | 36 ++-- >>> fs/ext4/move_extent.c | 2 +- >>> fs/ext4/readpage.c | 2 +- >>> fs/fat/dir.c | 4 +- >>> fs/fat/fatent.c | 4 +- >>> fs/nfs/nfs42proc.c | 2 +- >>> fs/nfs/nfs42xattr.c | 2 +- >>> fs/nfs/nfs4proc.c | 2 +- >>> include/asm-generic/pgtable-geometry.h | 71 +++++++ >>> include/asm-generic/vmlinux.lds.h | 38 ++-- >>> include/linux/buffer_head.h | 1 + >>> include/linux/cpumask.h | 5 + >>> include/linux/linkage.h | 4 +- >>> include/linux/mm.h | 17 +- >>> include/linux/mm_types.h | 15 +- >>> include/linux/mm_types_task.h | 2 +- >>> include/linux/mmzone.h | 3 +- >>> include/linux/netlink.h | 6 +- >>> include/linux/percpu-defs.h | 4 +- >>> include/linux/perf_event.h | 2 +- >>> include/linux/sched.h | 4 +- >>> include/linux/slab.h | 7 +- >>> include/linux/stackdepot.h | 6 +- >>> include/linux/sunrpc/svc.h | 8 +- >>> include/linux/sunrpc/svc_rdma.h | 4 +- >>> include/linux/sunrpc/svcsock.h | 2 +- >>> include/linux/swap.h | 17 +- >>> include/linux/swapops.h | 6 +- >>> include/linux/thread_info.h | 10 +- >>> include/xen/page.h | 2 + >>> init/main.c | 7 +- >>> kernel/bpf/core.c | 9 +- >>> kernel/bpf/ringbuf.c | 54 ++--- >>> kernel/cgroup/cgroup.c | 8 +- >>> kernel/crash_core.c | 2 +- >>> kernel/events/core.c | 2 +- >>> kernel/fork.c | 71 +++---- >>> kernel/power/power.h | 2 +- >>> kernel/power/snapshot.c | 2 +- >>> kernel/power/swap.c | 129 +++++++++-- >>> kernel/trace/fgraph.c | 2 +- >>> kernel/trace/trace.c | 2 +- >>> lib/stackdepot.c 
| 6 +- >>> mm/kasan/report.c | 3 +- >>> mm/memcontrol.c | 11 +- >>> mm/memory.c | 4 +- >>> mm/mmap.c | 2 +- >>> mm/page-writeback.c | 2 +- >>> mm/page_alloc.c | 31 +-- >>> mm/slub.c | 2 +- >>> mm/sparse.c | 2 +- >>> mm/swapfile.c | 2 +- >>> mm/vmalloc.c | 7 +- >>> net/9p/trans_virtio.c | 4 +- >>> net/core/hotdata.c | 4 +- >>> net/core/skbuff.c | 4 +- >>> net/core/sysctl_net_core.c | 2 +- >>> net/sunrpc/cache.c | 3 +- >>> net/unix/af_unix.c | 2 +- >>> sound/soc/soc-utils.c | 4 +- >>> virt/kvm/kvm_main.c | 2 +- >>> 172 files changed, 2185 insertions(+), 951 deletions(-) >>> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >>> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >>> create mode 100644 arch/arm64/mm/pgtable-geometry.c >>> create mode 100644 include/asm-generic/pgtable-geometry.h >>> >>> -- >>> 2.43.0 >>> >>> >> >> Hi Ryan, >> >> First off, this is excellent work! Your cover page was very detailed >> and made the patch set easier to understand. >> >> Some questions/comments: >> >> Once a kernel is booted with a certain page size, could there be issues >> if it is booted later with a different page size? How about if this is >> done frequently? > > I think that is the reason why you are only given the option in RHEL > to select the kernel (4K vs. 64K) to use at install time. > > Software can easily use a different data format for persistance based > on the base page size. I would suspect DBs might be the usual suspects. > > One example is swap space I think, where the base page size used when > formatting the device is used, and it cannot be used with a different > page size unless reformatting it. > > So ... one has to be a bit careful ... > Yes, that is what I was thinking. Once a userspace process does an I/O and if it is based on PAGE_SIZE things can go south. I think this is not an issue with THP, so maybe it's possible with boot-time page selection?
>>> Hi Ryan, >>> >>> First off, this is excellent work! Your cover page was very detailed >>> and made the patch set easier to understand. >>> >>> Some questions/comments: >>> >>> Once a kernel is booted with a certain page size, could there be issues >>> if it is booted later with a different page size? How about if this is >>> done frequently? >> >> I think that is the reason why you are only given the option in RHEL >> to select the kernel (4K vs. 64K) to use at install time. >> >> Software can easily use a different data format for persistence based >> on the base page size. I would suspect DBs might be the usual suspects. >> >> One example is swap space I think, where the base page size used when >> formatting the device is used, and it cannot be used with a different >> page size unless reformatting it. >> >> So ... one has to be a bit careful ... >> > Yes, that is what I was thinking. Once a userspace process does an I/O > and if it is based on PAGE_SIZE things can go south. I think this is > not an issue with THP, so maybe it's possible with boot-time page selection? THP is a different beast and has different semantics: the base page size doesn't change; the result of getpagesize() is unmodified ("transparent"). One would have to emulate for a given user space process a different page size ... and Ryan can likely tell some stories about that. Not that I consider it reasonable to have dynamic page sizes in the kernel and then try emulating a different one for all user space. -- Cheers, David / dhildenb
On 10/18/24 15:27, David Hildenbrand wrote: > >>>> Hi Ryan, >>>> >>>> First off, this is excellent work! Your cover page was very detailed >>>> and made the patch set easier to understand. >>>> >>>> Some questions/comments: >>>> >>>> Once a kernel is booted with a certain page size, could there be >>>> issues >>>> if it is booted later with a different page size? How about if >>>> this is >>>> done frequently? >>> >>> I think that is the reason why you are only given the option in RHEL >>> to select the kernel (4K vs. 64K) to use at install time. >>> >>> Software can easily use a different data format for persistence based >>> on the base page size. I would suspect DBs might be the usual suspects. >>> >>> One example is swap space I think, where the base page size used when >>> formatting the device is used, and it cannot be used with a different >>> page size unless reformatting it. >>> >>> So ... one has to be a bit careful ... >>> >> Yes, that is what I was thinking. Once a userspace process does an I/O >> and if it is based on PAGE_SIZE things can go south. I think this is >> not an issue with THP, so maybe it's possible with boot-time page >> selection? > > THP is a different beast and has different semantics: the base page > size doesn't change: the result of getpagesize() is unmodified > ("transparent"). > > One would have to emulate for a given user space process a different > page size ... and Ryan can likely tell some stories about that. > > Not that I consider it reasonable to have dynamic page sizes in the > kernel and then try emulating a different one for all user space. This is probably a case of ensuring proper documentation from the distro or application vendor. Or maybe some type of "safety gate" could be implemented outside of the kernel: some check for the prior use of different page sizes, in the cases where it could cause problems.
On 18/10/2024 21:06, Joseph Salisbury wrote: > > > > On 10/18/24 15:27, David Hildenbrand wrote: >> >>>>> Hi Ryan, >>>>> >>>>> First off, this is excellent work! Your cover page was very detailed >>>>> and made the patch set easier to understand. Thanks! >>>>> >>>>> Some questions/comments: >>>>> >>>>> Once a kernel is booted with a certain page size, could there be issues >>>>> if it is booted later with a different page size? How about if this is >>>>> done frequently? >>>> >>>> I think that is the reason why you are only given the option in RHEL >>>> to select the kernel (4K vs. 64K) to use at install time. >>>> >>>> Software can easily use a different data format for persistence based >>>> on the base page size. I would suspect DBs might be the usual suspects. >>>> >>>> One example is swap space I think, where the base page size used when >>>> formatting the device is used, and it cannot be used with a different >>>> page size unless reformatting it. >>>> >>>> So ... one has to be a bit careful ... >>>> >>> Yes, that is what I was thinking. Once a userspace process does an I/O >>> and if it is based on PAGE_SIZE things can go south. I think this is >>> not an issue with THP, so maybe it's possible with boot-time page selection? >> >> THP is a different beast and has different semantics: the base page size >> doesn't change: the result of getpagesize() is unmodified ("transparent"). >> >> One would have to emulate for a given user space process a different page >> size ... and Ryan can likely tell some stories about that. >> >> Not that I consider it reasonable to have dynamic page sizes in the kernel and >> then try emulating a different one for all user space. > > This is probably a case of ensuring proper documentation from the distro or > application vendor. > > Or maybe some type of "Safety gate" could be implemented outside of the kernel. > Some check for the prior use of different page sizes, in the cases where it > could cause problems. 
I agree there are likely to be problems in some corner cases if switching page size between boots, if persisted data makes assumptions about the page size. I would argue that any problems that are observed should really be considered bugs in the user space SW though. But I don't think this is really any different from today: with Ubuntu, for example, you can install both 4K and 64K kernels concurrently, then choose which one to boot via GRUB. So the issue exists there already. This proposed boot-time page size selection series doesn't make that any worse; it just simplifies the distribution model, given the reality that distros are now having to support multiple page sizes. Thanks, Ryan
On 10/14/24 5:55AM, Ryan Roberts wrote: > Hi All, > > Patch bomb incoming... This covers many subsystems, so I've included a core set > of people on the full series and additionally included maintainers on relevant > patches. I haven't included those maintainers on this cover letter since the > numbers were far too big for it to work. But I've included a link to this cover > letter on each patch, so they can hopefully find their way here. For follow up > submissions I'll break it up by subsystem, but for now thought it was important > to show the full picture. > > This RFC series implements support for boot-time page size selection within the > arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page > size has been selected at compile-time, meaning the size is baked into a given > kernel image. As use of larger-than-4K page sizes becomes more prevalent this > starts to present a problem for distributions. Boot-time page size selection > enables the creation of a single kernel image, which can be told which page size > to use on the kernel command line. This looks really promising. Building and maintaining separate kernels is costly. Being able to build one kernel for three potential page sizes would not only cut down on the overhead of producing kernel packages and images, but also ease benchmarking and testing different page sizes without the need to build and install multiple kernels. I'm also impressed that the patches are less intrusive than I would have expected. I'm looking forward to seeing this project move forward. Thanks, Shaggy > > Why is having an image-per-page size problematic? > ================================================= > > Many traditional distros are now supporting both 4K and 64K. And this means > managing 2 kernel packages, along with drivers for each. For some, it means > multiple installer flavours and multiple ISOs. All of this adds up to a > less-than-ideal level of complexity. 
Additionally, Android now supports 4K and > 16K kernels. I'm told having to explicitly manage their KABI for each kernel is > painful, and the extra flash space required for both kernel images and the > duplicated modules has been problematic. Boot-time page size selection solves > all of this. > > Additionally, in starting to think about the longer term deployment story for > D128 page tables, which the Arm architecture now supports, a lot of the same > problems need to be solved, so this work sets us up nicely for that. > > So what's the down side? > ======================== > > Well, nothing's free; various static allocations in the kernel image must be > sized for the worst case (largest supported page size), so image size is in line > with size of 64K compile-time image. So if you're interested in 4K or 16K, there > is a slight increase to the image size. But I expect that problem goes away if > you're compressing the image - it's just some extra zeros. At boot-time, I expect > we could free the unused static storage once we know the page size - although > that would be a follow up enhancement. > > And then there is performance. Since PAGE_SIZE and friends are no longer > compile-time constants, we must look up their values and do arithmetic at > runtime instead of compile-time. My early perf testing suggests this is > imperceptible for real-world workloads, and only has a small impact on > microbenchmarks - more on this below. > > Approach > ======== > > The basic idea is to rid the source of any assumptions that PAGE_SIZE and > friends are compile-time constant, but in a way that allows the compiler to > perform the same optimizations as was previously being done if they do turn out > to be compile-time constant. Where constants are required, we use limits; > PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description > of all the classes of problems to solve. > > By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. 
But an arch may opt-in to > boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 > does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, > which is an alternative to selecting a compile-time page size. > > When boot-time page size is active, the arch pgtable geometry macro definitions > resolve to something that can be configured at boot. The arm64 implementation in > this series mainly uses global, __ro_after_init variables. I've tried using > alternatives patching, but that performs worse than loading from memory; I think > due to code size bloat. > > Status > ====== > > When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented enough > to compile the kernel image itself with defconfig (and a few other bits and > pieces). This is enough to build a kernel that can boot under QEMU or FVP. I'll > happily do the rest of the work to enable all the extra drivers, but wanted to > get feedback on the shape of this effort first. If anyone wants to do any > testing, and has a must-have config, let me know and I'll prioritize enabling it > first. > > The series is arranged as follows: > > - patch 1: Add macros required for converting non-arch code to support > boot-time page size selection > - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all > non-arch code > - patches 37-38: Some arm64 tidy ups > - patch 39: Add macros required for converting arm64 code to support > boot-time page size selection > - patches 40-56: arm64 changes to support boot-time page size selection > - patch 57: Add arm64 Kconfig option to enable boot-time page size > selection > > Ideally, I'd like to get the basics merged (something like this series), then > incrementally improve it over a handful of kernel releases until we can > demonstrate that we have feature parity with the compile-time build and no > performance blockers. 
Once at that point, ideally the compile-time build options > would be removed and the code could be cleaned up further. > > One of the bigger pieces that I'd propose to add as a follow up is to make > va-size boot-time selectable too. That will greatly simplify LPA2 fallback > handling. > > Assuming people are amenable to the rough shape, how would I go about getting > the non-arch changes merged? Since they cover many subsystems, will each piece > need to go independently to each relevant maintainer or could it all be merged > together through the arm64 tree? > > Image Size > ========== > > The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) > kernel image on disk for base (before any changes applied), compile (with > changes, configured for compile-time page size) and boot (with changes, > configured for boot-time page size). > > You can see that the compile-16k and 64k configs are actually slightly smaller > than the baselines; that's due to optimizing some buffer sizes which didn't need > to depend on page size during the series. The boot-time image is ~1% bigger than > the 64k compile-time image. I believe there is scope to improve this to make it > equal to compile-64k if required: > > | config | size/KB | diff/KB | diff/% | > |-------------|---------|---------|---------| > | base-4k | 54895 | 0 | 0.0% | > | base-16k | 55161 | 266 | 0.5% | > | base-64k | 56775 | 1880 | 3.4% | > | compile-4k | 54895 | 0 | 0.0% | > | compile-16k | 55097 | 202 | 0.4% | > | compile-64k | 56391 | 1496 | 2.7% | > | boot-4K | 57045 | 2150 | 3.9% | > > And below shows the size of the image in memory at run-time, separated for text > and data costs. The boot image has ~1% text cost; most likely due to the fact > that PAGE_SIZE and friends are not compile-time constants so need instructions > to load the values and do arithmetic. 
I believe we could eventually get the data > cost to match the cost for the compile image for the chosen page size by freeing > the ends of the static buffers not needed for the selected page size: > > | | text | text | text | data | data | data | > | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | > | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | > | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | > | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | > | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | > | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | > | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | > > Functional Testing > ================== > > I've build-tested defconfig for all arches supported by tuxmake (which is most) > without issue. > > I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page sizes > and a few va-sizes, and additionally have run all the mm-selftests, with no > regressions observed vs the equivalent compile-time page size build (although > the mm-selftests have a few existing failures when run against 16K and 64K > kernels - those should really be investigated and fixed independently). > > Test coverage is lacking for many of the drivers that I've touched, but in many > cases, I'm hoping the changes are simple enough that review might suffice? > > Performance Testing > =================== > > I've run some limited performance benchmarks: > > First, a real-world benchmark that causes a lot of page table manipulation (and > therefore we would expect to see regression here if we are going to see it > anywhere); kernel compilation. It barely registers a change. Values are times, > so smaller is better. 
All relative to base-4k: > > | | kern | kern | user | user | real | real | > | config | mean | stdev | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > > The Speedometer JavaScript benchmark also shows no change. Values are runs per > min, so bigger is better. All relative to base-4k: > > | config | mean | stdev | > |-------------|---------|---------| > | base-4k | 0.0% | 0.8% | > | compile-4k | 0.4% | 0.8% | > | boot-4k | 0.0% | 0.9% | > > Finally, I've run some microbenchmarks known to stress page table manipulations > (originally from David Hildenbrand). The fork test maps/allocs 1G of anon > memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon > memory then measures the cost of munmap()ing it. The fork test is known to be > extremely sensitive to any changes that cause instructions to be aligned > differently in cachelines. When using this test for other changes, I've seen > double digit regressions for the slightest thing, so 12% regression on this test > is actually fairly good. This likely represents the extreme worst case for > regressions that will be observed across other microbenchmarks (famous last > words). Values are times, so smaller is better. All relative to base-4k: > > | | fork | fork | munmap | munmap | > | config | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | > | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | > | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | > > NOTE: The series applies on top of v6.11.
> > Thanks, > Ryan > > > Ryan Roberts (57): > mm: Add macros ahead of supporting boot-time page size selection > vmlinux: Align to PAGE_SIZE_MAX > mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large > mm/page_alloc: Make page_frag_cache boot-time page size compatible > mm: Avoid split pmd ptl if pmd level is run-time folded > mm: Remove PAGE_SIZE compile-time constant assumption > fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing > fs: Remove PAGE_SIZE compile-time constant assumption > fs/nfs: Remove PAGE_SIZE compile-time constant assumption > fs/ext4: Remove PAGE_SIZE compile-time constant assumption > fork: Permit boot-time THREAD_SIZE determination > cgroup: Remove PAGE_SIZE compile-time constant assumption > bpf: Remove PAGE_SIZE compile-time constant assumption > pm/hibernate: Remove PAGE_SIZE compile-time constant assumption > stackdepot: Remove PAGE_SIZE compile-time constant assumption > perf: Remove PAGE_SIZE compile-time constant assumption > kvm: Remove PAGE_SIZE compile-time constant assumption > trace: Remove PAGE_SIZE compile-time constant assumption > crash: Remove PAGE_SIZE compile-time constant assumption > crypto: Remove PAGE_SIZE compile-time constant assumption > sunrpc: Remove PAGE_SIZE compile-time constant assumption > sound: Remove PAGE_SIZE compile-time constant assumption > net: Remove PAGE_SIZE compile-time constant assumption > net: fec: Remove PAGE_SIZE compile-time constant assumption > net: marvell: Remove PAGE_SIZE compile-time constant assumption > net: hns3: Remove PAGE_SIZE compile-time constant assumption > net: e1000: Remove PAGE_SIZE compile-time constant assumption > net: igbvf: Remove PAGE_SIZE compile-time constant assumption > net: igb: Remove PAGE_SIZE compile-time constant assumption > drivers/base: Remove PAGE_SIZE compile-time constant assumption > edac: Remove PAGE_SIZE compile-time constant assumption > optee: Remove PAGE_SIZE compile-time constant assumption > random: Remove PAGE_SIZE 
compile-time constant assumption > sata_sil24: Remove PAGE_SIZE compile-time constant assumption > virtio: Remove PAGE_SIZE compile-time constant assumption > xen: Remove PAGE_SIZE compile-time constant assumption > arm64: Fix macros to work in C code in addition to the linker script > arm64: Track early pgtable allocation limit > arm64: Introduce macros required for boot-time page selection > arm64: Refactor early pgtable size calculation macros > arm64: Pass desired page size on command line > arm64: Divorce early init from PAGE_SIZE > arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES > arm64: Align sections to PAGE_SIZE_MAX > arm64: Rework trampoline rodata mapping > arm64: Generalize fixmap for boot-time page size > arm64: Statically allocate and align for worst-case page size > arm64: Convert switch to if for non-const comparison values > arm64: Convert BUILD_BUG_ON to VM_BUG_ON > arm64: Remove PAGE_SZ asm-offset > arm64: Introduce cpu features for page sizes > arm64: Remove PAGE_SIZE from assembly code > arm64: Runtime-fold pmd level > arm64: Support runtime folding in idmap_kpti_install_ng_mappings > arm64: TRAMP_VALIAS is no longer compile-time constant > arm64: Determine THREAD_SIZE at boot-time > arm64: Enable boot-time page size selection > > arch/alpha/include/asm/page.h | 1 + > arch/arc/include/asm/page.h | 1 + > arch/arm/include/asm/page.h | 1 + > arch/arm64/Kconfig | 26 ++- > arch/arm64/include/asm/assembler.h | 78 ++++++- > arch/arm64/include/asm/cpufeature.h | 44 +++- > arch/arm64/include/asm/efi.h | 2 +- > arch/arm64/include/asm/fixmap.h | 28 ++- > arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- > arch/arm64/include/asm/kvm_arm.h | 21 +- > arch/arm64/include/asm/kvm_hyp.h | 11 + > arch/arm64/include/asm/kvm_pgtable.h | 6 +- > arch/arm64/include/asm/memory.h | 62 ++++-- > arch/arm64/include/asm/page-def.h | 3 +- > arch/arm64/include/asm/pgalloc.h | 16 +- > arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ > 
arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- > arch/arm64/include/asm/pgtable-prot.h | 2 +- > arch/arm64/include/asm/pgtable.h | 133 +++++++++--- > arch/arm64/include/asm/processor.h | 10 +- > arch/arm64/include/asm/sections.h | 1 + > arch/arm64/include/asm/smp.h | 1 + > arch/arm64/include/asm/sparsemem.h | 15 +- > arch/arm64/include/asm/sysreg.h | 54 +++-- > arch/arm64/include/asm/tlb.h | 3 + > arch/arm64/kernel/asm-offsets.c | 4 +- > arch/arm64/kernel/cpufeature.c | 93 ++++++-- > arch/arm64/kernel/efi.c | 2 +- > arch/arm64/kernel/entry.S | 60 +++++- > arch/arm64/kernel/head.S | 46 +++- > arch/arm64/kernel/hibernate-asm.S | 6 +- > arch/arm64/kernel/image-vars.h | 14 ++ > arch/arm64/kernel/image.h | 4 + > arch/arm64/kernel/pi/idreg-override.c | 68 +++++- > arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- > arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- > arch/arm64/kernel/pi/pi.h | 63 +++++- > arch/arm64/kernel/relocate_kernel.S | 10 +- > arch/arm64/kernel/vdso-wrap.S | 4 +- > arch/arm64/kernel/vdso.c | 7 +- > arch/arm64/kernel/vdso/vdso.lds.S | 4 +- > arch/arm64/kernel/vdso32-wrap.S | 4 +- > arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- > arch/arm64/kernel/vmlinux.lds.S | 48 +++-- > arch/arm64/kvm/arm.c | 10 + > arch/arm64/kvm/hyp/nvhe/Makefile | 1 + > arch/arm64/kvm/hyp/nvhe/host.S | 10 +- > arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- > arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ > arch/arm64/kvm/mmu.c | 39 ++-- > arch/arm64/lib/clear_page.S | 7 +- > arch/arm64/lib/copy_page.S | 33 ++- > arch/arm64/lib/mte.S | 27 ++- > arch/arm64/mm/Makefile | 1 + > arch/arm64/mm/fixmap.c | 38 ++-- > arch/arm64/mm/hugetlbpage.c | 40 +--- > arch/arm64/mm/init.c | 26 +-- > arch/arm64/mm/kasan_init.c | 8 +- > arch/arm64/mm/mmu.c | 53 +++-- > arch/arm64/mm/pgd.c | 12 +- > arch/arm64/mm/pgtable-geometry.c | 24 +++ > arch/arm64/mm/proc.S | 128 ++++++++--- > arch/arm64/mm/ptdump.c | 3 +- > arch/arm64/tools/cpucaps | 3 + > arch/csky/include/asm/page.h | 3 + > 
arch/hexagon/include/asm/page.h | 2 + > arch/loongarch/include/asm/page.h | 2 + > arch/m68k/include/asm/page.h | 1 + > arch/microblaze/include/asm/page.h | 1 + > arch/mips/include/asm/page.h | 1 + > arch/nios2/include/asm/page.h | 2 + > arch/openrisc/include/asm/page.h | 1 + > arch/parisc/include/asm/page.h | 1 + > arch/powerpc/include/asm/page.h | 2 + > arch/riscv/include/asm/page.h | 1 + > arch/s390/include/asm/page.h | 1 + > arch/sh/include/asm/page.h | 1 + > arch/sparc/include/asm/page.h | 3 + > arch/um/include/asm/page.h | 2 + > arch/x86/include/asm/page_types.h | 2 + > arch/xtensa/include/asm/page.h | 1 + > crypto/lskcipher.c | 4 +- > drivers/ata/sata_sil24.c | 46 ++-- > drivers/base/node.c | 6 +- > drivers/base/topology.c | 32 +-- > drivers/block/virtio_blk.c | 2 +- > drivers/char/random.c | 4 +- > drivers/edac/edac_mc.h | 13 +- > drivers/firmware/efi/libstub/arm64.c | 3 +- > drivers/irqchip/irq-gic-v3-its.c | 2 +- > drivers/mtd/mtdswap.c | 4 +- > drivers/net/ethernet/freescale/fec.h | 3 +- > drivers/net/ethernet/freescale/fec_main.c | 5 +- > .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- > drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- > drivers/net/ethernet/intel/igb/igb.h | 25 +-- > drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ > drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- > drivers/net/ethernet/marvell/mvneta.c | 9 +- > drivers/net/ethernet/marvell/sky2.h | 2 +- > drivers/tee/optee/call.c | 7 +- > drivers/tee/optee/smc_abi.c | 2 +- > drivers/virtio/virtio_balloon.c | 10 +- > drivers/xen/balloon.c | 11 +- > drivers/xen/biomerge.c | 12 +- > drivers/xen/privcmd.c | 2 +- > drivers/xen/xenbus/xenbus_client.c | 5 +- > drivers/xen/xlate_mmu.c | 6 +- > fs/binfmt_elf.c | 11 +- > fs/buffer.c | 2 +- > fs/coredump.c | 8 +- > fs/ext4/ext4.h | 36 ++-- > fs/ext4/move_extent.c | 2 +- > fs/ext4/readpage.c | 2 +- > fs/fat/dir.c | 4 +- > fs/fat/fatent.c | 4 +- > fs/nfs/nfs42proc.c | 2 +- > fs/nfs/nfs42xattr.c | 2 +- > fs/nfs/nfs4proc.c | 2 +- 
> include/asm-generic/pgtable-geometry.h | 71 +++++++ > include/asm-generic/vmlinux.lds.h | 38 ++-- > include/linux/buffer_head.h | 1 + > include/linux/cpumask.h | 5 + > include/linux/linkage.h | 4 +- > include/linux/mm.h | 17 +- > include/linux/mm_types.h | 15 +- > include/linux/mm_types_task.h | 2 +- > include/linux/mmzone.h | 3 +- > include/linux/netlink.h | 6 +- > include/linux/percpu-defs.h | 4 +- > include/linux/perf_event.h | 2 +- > include/linux/sched.h | 4 +- > include/linux/slab.h | 7 +- > include/linux/stackdepot.h | 6 +- > include/linux/sunrpc/svc.h | 8 +- > include/linux/sunrpc/svc_rdma.h | 4 +- > include/linux/sunrpc/svcsock.h | 2 +- > include/linux/swap.h | 17 +- > include/linux/swapops.h | 6 +- > include/linux/thread_info.h | 10 +- > include/xen/page.h | 2 + > init/main.c | 7 +- > kernel/bpf/core.c | 9 +- > kernel/bpf/ringbuf.c | 54 ++--- > kernel/cgroup/cgroup.c | 8 +- > kernel/crash_core.c | 2 +- > kernel/events/core.c | 2 +- > kernel/fork.c | 71 +++---- > kernel/power/power.h | 2 +- > kernel/power/snapshot.c | 2 +- > kernel/power/swap.c | 129 +++++++++-- > kernel/trace/fgraph.c | 2 +- > kernel/trace/trace.c | 2 +- > lib/stackdepot.c | 6 +- > mm/kasan/report.c | 3 +- > mm/memcontrol.c | 11 +- > mm/memory.c | 4 +- > mm/mmap.c | 2 +- > mm/page-writeback.c | 2 +- > mm/page_alloc.c | 31 +-- > mm/slub.c | 2 +- > mm/sparse.c | 2 +- > mm/swapfile.c | 2 +- > mm/vmalloc.c | 7 +- > net/9p/trans_virtio.c | 4 +- > net/core/hotdata.c | 4 +- > net/core/skbuff.c | 4 +- > net/core/sysctl_net_core.c | 2 +- > net/sunrpc/cache.c | 3 +- > net/unix/af_unix.c | 2 +- > sound/soc/soc-utils.c | 4 +- > virt/kvm/kvm_main.c | 2 +- > 172 files changed, 2185 insertions(+), 951 deletions(-) > create mode 100644 arch/arm64/include/asm/pgtable-geometry.h > create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c > create mode 100644 arch/arm64/mm/pgtable-geometry.c > create mode 100644 include/asm-generic/pgtable-geometry.h > > -- > 2.43.0 >
On 17/10/2024 23:05, Dave Kleikamp wrote: > On 10/14/24 5:55AM, Ryan Roberts wrote: >> Hi All, >> >> Patch bomb incoming... This covers many subsystems, so I've included a core set >> of people on the full series and additionally included maintainers on relevant >> patches. I haven't included those maintainers on this cover letter since the >> numbers were far too big for it to work. But I've included a link to this cover >> letter on each patch, so they can hopefully find their way here. For follow up >> submissions I'll break it up by subsystem, but for now thought it was important >> to show the full picture. >> >> This RFC series implements support for boot-time page size selection within the >> arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page >> size has been selected at compile-time, meaning the size is baked into a given >> kernel image. As use of larger-than-4K page sizes becomes more prevalent this >> starts to present a problem for distributions. Boot-time page size selection >> enables the creation of a single kernel image, which can be told which page size >> to use on the kernel command line. > > This looks really promising. Building and maintaining separate kernels is > costly. Being able to build one kernel for three potential page sizes would not > only cut down on the overhead of producing kernel packages and images, but also > eases benchmarking and testing different page sizes without the need to build > and install multiple kernels. > > I'm also impressed that the patches are less intrusive than I would have > expected. I'm looking forward to seeing this project move forward. Thanks for the feedback! I'm sure any review/test capacity that Oracle has would be greatly appreciated :) Thanks, Ryan > > Thanks, > Shaggy > >> >> Why is having an image-per-page size problematic? >> ================================================= >> >> Many traditional distros are now supporting both 4K and 64K.
And this means >> managing 2 kernel packages, along with drivers for each. For some, it means >> multiple installer flavours and multiple ISOs. All of this adds up to a >> less-than-ideal level of complexity. Additionally, Android now supports 4K and >> 16K kernels. I'm told having to explicitly manage their KABI for each kernel is >> painful, and the extra flash space required for both kernel images and the >> duplicated modules has been problematic. Boot-time page size selection solves >> all of this. >> >> Additionally, in starting to think about the longer term deployment story for >> D128 page tables, which Arm architecture now supports, a lot of the same >> problems need to be solved, so this work sets us up nicely for that. >> >> So what's the down side? >> ======================== >> >> Well nothing's free; Various static allocations in the kernel image must be >> sized for the worst case (largest supported page size), so image size is in line >> with size of 64K compile-time image. So if you're interested in 4K or 16K, there >> is a slight increase to the image size. But I expect that problem goes away if >> you're compressing the image - it's just some extra zeros. At boot-time, I expect >> we could free the unused static storage once we know the page size - although >> that would be a follow up enhancement. >> >> And then there is performance. Since PAGE_SIZE and friends are no longer >> compile-time constants, we must look up their values and do arithmetic at >> runtime instead of compile-time. My early perf testing suggests this is >> imperceptible for real-world workloads, and only has small impact on >> microbenchmarks - more on this below. >> >> Approach >> ======== >> >> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >> friends are compile-time constant, but in a way that allows the compiler to >> perform the same optimizations as was previously being done if they do turn out >> to be compile-time constant.
Where constants are required, we use limits; >> PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description >> of all the classes of problems to solve. >> >> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 >> does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, >> which is an alternative to selecting a compile-time page size. >> >> When boot-time page size is active, the arch pgtable geometry macro definitions >> resolve to something that can be configured at boot. The arm64 implementation in >> this series mainly uses global, __ro_after_init variables. I've tried using >> alternatives patching, but that performs worse than loading from memory; I think >> due to code size bloat. >> >> Status >> ====== >> >> When CONFIG_ARM64_BOOT_TIME_PAGE_SIZE is selected, I've only implemented enough >> to compile the kernel image itself with defconfig (and a few other bits and >> pieces). This is enough to build a kernel that can boot under QEMU or FVP. I'll >> happily do the rest of the work to enable all the extra drivers, but wanted to >> get feedback on the shape of this effort first. If anyone wants to do any >> testing, and has a must-have config, let me know and I'll prioritize enabling it >> first. 
>> >> The series is arranged as follows: >> >> - patch 1: Add macros required for converting non-arch code to support >> boot-time page size selection >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >> non-arch code >> - patches 37-38: Some arm64 tidy ups >> - patch 39: Add macros required for converting arm64 code to support >> boot-time page size selection >> - patches 40-56: arm64 changes to support boot-time page size selection >> - patch 57: Add arm64 Kconfig option to enable boot-time page size >> selection >> >> Ideally, I'd like to get the basics merged (something like this series), then >> incrementally improve it over a handful of kernel releases until we can >> demonstrate that we have feature parity with the compile-time build and no >> performance blockers. Once at that point, ideally the compile-time build options >> would be removed and the code could be cleaned up further. >> >> One of the bigger pieces that I'd propose to add as a follow up, is to make >> va-size boot-time selectable too. That will greatly simplify LPA2 fallback >> handling. >> >> Assuming people are amenable to the rough shape, how would I go about getting >> the non-arch changes merged? Since they cover many subsystems, will each piece >> need to go independently to each relevant maintainer or could it all be merged >> together through the arm64 tree? >> >> Image Size >> ========== >> >> The below shows the size of a defconfig (+ xfs, squashfs, ftrace, kprobes) >> kernel image on disk for base (before any changes applied), compile (with >> changes, configured for compile-time page size) and boot (with changes, >> configured for boot-time page size). >> >> You can see that the compile-16k and 64k configs are actually slightly smaller >> than the baselines; that's due to optimizing some buffer sizes which didn't need >> to depend on page size during the series. The boot-time image is ~1% bigger than >> the 64k compile-time image.
I believe there is scope to improve this to make it >> equal to compile-64k if required: >> >> | config | size/KB | diff/KB | diff/% | >> |-------------|---------|---------|---------| >> | base-4k | 54895 | 0 | 0.0% | >> | base-16k | 55161 | 266 | 0.5% | >> | base-64k | 56775 | 1880 | 3.4% | >> | compile-4k | 54895 | 0 | 0.0% | >> | compile-16k | 55097 | 202 | 0.4% | >> | compile-64k | 56391 | 1496 | 2.7% | >> | boot-4K | 57045 | 2150 | 3.9% | >> >> And below shows the size of the image in memory at run-time, separated for text >> and data costs. The boot image has ~1% text cost; most likely due to the fact >> that PAGE_SIZE and friends are not compile-time constants so need instructions >> to load the values and do arithmetic. I believe we could eventually get the data >> cost to match the cost for the compile image for the chosen page size by freeing >> the ends of the static buffers not needed for the selected page size: >> >> | | text | text | text | data | data | data | >> | config | size/KB | diff/KB | diff/% | size/KB | diff/KB | diff/% | >> |-------------|---------|---------|---------|---------|---------|---------| >> | base-4k | 20561 | 0 | 0.0% | 14314 | 0 | 0.0% | >> | base-16k | 20439 | -122 | -0.6% | 14625 | 311 | 2.2% | >> | base-64k | 20435 | -126 | -0.6% | 15673 | 1359 | 9.5% | >> | compile-4k | 20565 | 4 | 0.0% | 14315 | 1 | 0.0% | >> | compile-16k | 20443 | -118 | -0.6% | 14517 | 204 | 1.4% | >> | compile-64k | 20439 | -122 | -0.6% | 15134 | 820 | 5.7% | >> | boot-4K | 20811 | 250 | 1.2% | 15287 | 973 | 6.8% | >> >> Functional Testing >> ================== >> >> I've build-tested defconfig for all arches supported by tuxmake (which is most) >> without issue. 
>> >> I've boot-tested arm64 with CONFIG_ARM64_BOOT_TIME_PAGE_SIZE for all page sizes >> and a few va-sizes, and additionally have run all the mm-selftests, with no >> regressions observed vs the equivalent compile-time page size build (although >> the mm-selftests have a few existing failures when run against 16K and 64K >> kernels - those should really be investigated and fixed independently). >> >> Test coverage is lacking for many of the drivers that I've touched, but in many >> cases, I'm hoping the changes are simple enough that review might suffice? >> >> Performance Testing >> =================== >> >> I've run some limited performance benchmarks: >> >> First, a real-world benchmark that causes a lot of page table manipulation (and >> therefore we would expect to see regression here if we are going to see it >> anywhere); kernel compilation. It barely registers a change. Values are times, >> so smaller is better. All relative to base-4k: >> >> | | kern | kern | user | user | real | real | >> | config | mean | stdev | mean | stdev | mean | stdev | >> |-------------|---------|---------|---------|---------|---------|---------| >> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >> >> The Speedometer JavaScript benchmark also shows no change. Values are runs per >> min, so bigger is better. All relative to base-4k: >> >> | config | mean | stdev | >> |-------------|---------|---------| >> | base-4k | 0.0% | 0.8% | >> | compile-4k | 0.4% | 0.8% | >> | boot-4k | 0.0% | 0.9% | >> >> Finally, I've run some microbenchmarks known to stress page table manipulations >> (originally from David Hildenbrand). The fork test maps/allocs 1G of anon >> memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon >> memory then measures the cost of munmap()ing it. 
The fork test is known to be >> extremely sensitive to any changes that cause instructions to be aligned >> differently in cachelines. When using this test for other changes, I've seen >> double digit regressions for the slightest thing, so 12% regression on this test >> is actually fairly good. This likely represents the extreme worst case for >> regressions that will be observed across other microbenchmarks (famous last >> words). Values are times, so smaller is better. All relative to base-4k: >> >> | | fork | fork | munmap | munmap | >> | config | mean | stdev | mean | stdev | >> |-------------|---------|---------|---------|---------| >> | base-4k | 0.0% | 1.3% | 0.0% | 0.3% | >> | compile-4k | 0.1% | 1.3% | -0.9% | 0.1% | >> | boot-4k | 12.8% | 1.2% | 3.8% | 1.0% | >> >> NOTE: The series applies on top of v6.11. >> >> Thanks, >> Ryan >> >> >> Ryan Roberts (57): >> mm: Add macros ahead of supporting boot-time page size selection >> vmlinux: Align to PAGE_SIZE_MAX >> mm/memcontrol: Fix seq_buf size to save memory when PAGE_SIZE is large >> mm/page_alloc: Make page_frag_cache boot-time page size compatible >> mm: Avoid split pmd ptl if pmd level is run-time folded >> mm: Remove PAGE_SIZE compile-time constant assumption >> fs: Introduce MAX_BUF_PER_PAGE_SIZE_MAX for array sizing >> fs: Remove PAGE_SIZE compile-time constant assumption >> fs/nfs: Remove PAGE_SIZE compile-time constant assumption >> fs/ext4: Remove PAGE_SIZE compile-time constant assumption >> fork: Permit boot-time THREAD_SIZE determination >> cgroup: Remove PAGE_SIZE compile-time constant assumption >> bpf: Remove PAGE_SIZE compile-time constant assumption >> pm/hibernate: Remove PAGE_SIZE compile-time constant assumption >> stackdepot: Remove PAGE_SIZE compile-time constant assumption >> perf: Remove PAGE_SIZE compile-time constant assumption >> kvm: Remove PAGE_SIZE compile-time constant assumption >> trace: Remove PAGE_SIZE compile-time constant assumption >> crash: Remove PAGE_SIZE
compile-time constant assumption >> crypto: Remove PAGE_SIZE compile-time constant assumption >> sunrpc: Remove PAGE_SIZE compile-time constant assumption >> sound: Remove PAGE_SIZE compile-time constant assumption >> net: Remove PAGE_SIZE compile-time constant assumption >> net: fec: Remove PAGE_SIZE compile-time constant assumption >> net: marvell: Remove PAGE_SIZE compile-time constant assumption >> net: hns3: Remove PAGE_SIZE compile-time constant assumption >> net: e1000: Remove PAGE_SIZE compile-time constant assumption >> net: igbvf: Remove PAGE_SIZE compile-time constant assumption >> net: igb: Remove PAGE_SIZE compile-time constant assumption >> drivers/base: Remove PAGE_SIZE compile-time constant assumption >> edac: Remove PAGE_SIZE compile-time constant assumption >> optee: Remove PAGE_SIZE compile-time constant assumption >> random: Remove PAGE_SIZE compile-time constant assumption >> sata_sil24: Remove PAGE_SIZE compile-time constant assumption >> virtio: Remove PAGE_SIZE compile-time constant assumption >> xen: Remove PAGE_SIZE compile-time constant assumption >> arm64: Fix macros to work in C code in addition to the linker script >> arm64: Track early pgtable allocation limit >> arm64: Introduce macros required for boot-time page selection >> arm64: Refactor early pgtable size calculation macros >> arm64: Pass desired page size on command line >> arm64: Divorce early init from PAGE_SIZE >> arm64: Clean up simple cases of CONFIG_ARM64_*K_PAGES >> arm64: Align sections to PAGE_SIZE_MAX >> arm64: Rework trampoline rodata mapping >> arm64: Generalize fixmap for boot-time page size >> arm64: Statically allocate and align for worst-case page size >> arm64: Convert switch to if for non-const comparison values >> arm64: Convert BUILD_BUG_ON to VM_BUG_ON >> arm64: Remove PAGE_SZ asm-offset >> arm64: Introduce cpu features for page sizes >> arm64: Remove PAGE_SIZE from assembly code >> arm64: Runtime-fold pmd level >> arm64: Support runtime folding in 
idmap_kpti_install_ng_mappings >> arm64: TRAMP_VALIAS is no longer compile-time constant >> arm64: Determine THREAD_SIZE at boot-time >> arm64: Enable boot-time page size selection >> >> arch/alpha/include/asm/page.h | 1 + >> arch/arc/include/asm/page.h | 1 + >> arch/arm/include/asm/page.h | 1 + >> arch/arm64/Kconfig | 26 ++- >> arch/arm64/include/asm/assembler.h | 78 ++++++- >> arch/arm64/include/asm/cpufeature.h | 44 +++- >> arch/arm64/include/asm/efi.h | 2 +- >> arch/arm64/include/asm/fixmap.h | 28 ++- >> arch/arm64/include/asm/kernel-pgtable.h | 150 +++++++++---- >> arch/arm64/include/asm/kvm_arm.h | 21 +- >> arch/arm64/include/asm/kvm_hyp.h | 11 + >> arch/arm64/include/asm/kvm_pgtable.h | 6 +- >> arch/arm64/include/asm/memory.h | 62 ++++-- >> arch/arm64/include/asm/page-def.h | 3 +- >> arch/arm64/include/asm/pgalloc.h | 16 +- >> arch/arm64/include/asm/pgtable-geometry.h | 46 ++++ >> arch/arm64/include/asm/pgtable-hwdef.h | 28 ++- >> arch/arm64/include/asm/pgtable-prot.h | 2 +- >> arch/arm64/include/asm/pgtable.h | 133 +++++++++--- >> arch/arm64/include/asm/processor.h | 10 +- >> arch/arm64/include/asm/sections.h | 1 + >> arch/arm64/include/asm/smp.h | 1 + >> arch/arm64/include/asm/sparsemem.h | 15 +- >> arch/arm64/include/asm/sysreg.h | 54 +++-- >> arch/arm64/include/asm/tlb.h | 3 + >> arch/arm64/kernel/asm-offsets.c | 4 +- >> arch/arm64/kernel/cpufeature.c | 93 ++++++-- >> arch/arm64/kernel/efi.c | 2 +- >> arch/arm64/kernel/entry.S | 60 +++++- >> arch/arm64/kernel/head.S | 46 +++- >> arch/arm64/kernel/hibernate-asm.S | 6 +- >> arch/arm64/kernel/image-vars.h | 14 ++ >> arch/arm64/kernel/image.h | 4 + >> arch/arm64/kernel/pi/idreg-override.c | 68 +++++- >> arch/arm64/kernel/pi/map_kernel.c | 165 ++++++++++---- >> arch/arm64/kernel/pi/map_range.c | 201 ++++++++++++++++-- >> arch/arm64/kernel/pi/pi.h | 63 +++++- >> arch/arm64/kernel/relocate_kernel.S | 10 +- >> arch/arm64/kernel/vdso-wrap.S | 4 +- >> arch/arm64/kernel/vdso.c | 7 +- >> 
arch/arm64/kernel/vdso/vdso.lds.S | 4 +- >> arch/arm64/kernel/vdso32-wrap.S | 4 +- >> arch/arm64/kernel/vdso32/vdso.lds.S | 4 +- >> arch/arm64/kernel/vmlinux.lds.S | 48 +++-- >> arch/arm64/kvm/arm.c | 10 + >> arch/arm64/kvm/hyp/nvhe/Makefile | 1 + >> arch/arm64/kvm/hyp/nvhe/host.S | 10 +- >> arch/arm64/kvm/hyp/nvhe/hyp.lds.S | 4 +- >> arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c | 16 ++ >> arch/arm64/kvm/mmu.c | 39 ++-- >> arch/arm64/lib/clear_page.S | 7 +- >> arch/arm64/lib/copy_page.S | 33 ++- >> arch/arm64/lib/mte.S | 27 ++- >> arch/arm64/mm/Makefile | 1 + >> arch/arm64/mm/fixmap.c | 38 ++-- >> arch/arm64/mm/hugetlbpage.c | 40 +--- >> arch/arm64/mm/init.c | 26 +-- >> arch/arm64/mm/kasan_init.c | 8 +- >> arch/arm64/mm/mmu.c | 53 +++-- >> arch/arm64/mm/pgd.c | 12 +- >> arch/arm64/mm/pgtable-geometry.c | 24 +++ >> arch/arm64/mm/proc.S | 128 ++++++++--- >> arch/arm64/mm/ptdump.c | 3 +- >> arch/arm64/tools/cpucaps | 3 + >> arch/csky/include/asm/page.h | 3 + >> arch/hexagon/include/asm/page.h | 2 + >> arch/loongarch/include/asm/page.h | 2 + >> arch/m68k/include/asm/page.h | 1 + >> arch/microblaze/include/asm/page.h | 1 + >> arch/mips/include/asm/page.h | 1 + >> arch/nios2/include/asm/page.h | 2 + >> arch/openrisc/include/asm/page.h | 1 + >> arch/parisc/include/asm/page.h | 1 + >> arch/powerpc/include/asm/page.h | 2 + >> arch/riscv/include/asm/page.h | 1 + >> arch/s390/include/asm/page.h | 1 + >> arch/sh/include/asm/page.h | 1 + >> arch/sparc/include/asm/page.h | 3 + >> arch/um/include/asm/page.h | 2 + >> arch/x86/include/asm/page_types.h | 2 + >> arch/xtensa/include/asm/page.h | 1 + >> crypto/lskcipher.c | 4 +- >> drivers/ata/sata_sil24.c | 46 ++-- >> drivers/base/node.c | 6 +- >> drivers/base/topology.c | 32 +-- >> drivers/block/virtio_blk.c | 2 +- >> drivers/char/random.c | 4 +- >> drivers/edac/edac_mc.h | 13 +- >> drivers/firmware/efi/libstub/arm64.c | 3 +- >> drivers/irqchip/irq-gic-v3-its.c | 2 +- >> drivers/mtd/mtdswap.c | 4 +- >> 
drivers/net/ethernet/freescale/fec.h | 3 +- >> drivers/net/ethernet/freescale/fec_main.c | 5 +- >> .../net/ethernet/hisilicon/hns3/hns3_enet.h | 4 +- >> drivers/net/ethernet/intel/e1000/e1000_main.c | 6 +- >> drivers/net/ethernet/intel/igb/igb.h | 25 +-- >> drivers/net/ethernet/intel/igb/igb_main.c | 149 +++++++------ >> drivers/net/ethernet/intel/igbvf/netdev.c | 6 +- >> drivers/net/ethernet/marvell/mvneta.c | 9 +- >> drivers/net/ethernet/marvell/sky2.h | 2 +- >> drivers/tee/optee/call.c | 7 +- >> drivers/tee/optee/smc_abi.c | 2 +- >> drivers/virtio/virtio_balloon.c | 10 +- >> drivers/xen/balloon.c | 11 +- >> drivers/xen/biomerge.c | 12 +- >> drivers/xen/privcmd.c | 2 +- >> drivers/xen/xenbus/xenbus_client.c | 5 +- >> drivers/xen/xlate_mmu.c | 6 +- >> fs/binfmt_elf.c | 11 +- >> fs/buffer.c | 2 +- >> fs/coredump.c | 8 +- >> fs/ext4/ext4.h | 36 ++-- >> fs/ext4/move_extent.c | 2 +- >> fs/ext4/readpage.c | 2 +- >> fs/fat/dir.c | 4 +- >> fs/fat/fatent.c | 4 +- >> fs/nfs/nfs42proc.c | 2 +- >> fs/nfs/nfs42xattr.c | 2 +- >> fs/nfs/nfs4proc.c | 2 +- >> include/asm-generic/pgtable-geometry.h | 71 +++++++ >> include/asm-generic/vmlinux.lds.h | 38 ++-- >> include/linux/buffer_head.h | 1 + >> include/linux/cpumask.h | 5 + >> include/linux/linkage.h | 4 +- >> include/linux/mm.h | 17 +- >> include/linux/mm_types.h | 15 +- >> include/linux/mm_types_task.h | 2 +- >> include/linux/mmzone.h | 3 +- >> include/linux/netlink.h | 6 +- >> include/linux/percpu-defs.h | 4 +- >> include/linux/perf_event.h | 2 +- >> include/linux/sched.h | 4 +- >> include/linux/slab.h | 7 +- >> include/linux/stackdepot.h | 6 +- >> include/linux/sunrpc/svc.h | 8 +- >> include/linux/sunrpc/svc_rdma.h | 4 +- >> include/linux/sunrpc/svcsock.h | 2 +- >> include/linux/swap.h | 17 +- >> include/linux/swapops.h | 6 +- >> include/linux/thread_info.h | 10 +- >> include/xen/page.h | 2 + >> init/main.c | 7 +- >> kernel/bpf/core.c | 9 +- >> kernel/bpf/ringbuf.c | 54 ++--- >> kernel/cgroup/cgroup.c | 8 +- >> 
kernel/crash_core.c | 2 +- >> kernel/events/core.c | 2 +- >> kernel/fork.c | 71 +++---- >> kernel/power/power.h | 2 +- >> kernel/power/snapshot.c | 2 +- >> kernel/power/swap.c | 129 +++++++++-- >> kernel/trace/fgraph.c | 2 +- >> kernel/trace/trace.c | 2 +- >> lib/stackdepot.c | 6 +- >> mm/kasan/report.c | 3 +- >> mm/memcontrol.c | 11 +- >> mm/memory.c | 4 +- >> mm/mmap.c | 2 +- >> mm/page-writeback.c | 2 +- >> mm/page_alloc.c | 31 +-- >> mm/slub.c | 2 +- >> mm/sparse.c | 2 +- >> mm/swapfile.c | 2 +- >> mm/vmalloc.c | 7 +- >> net/9p/trans_virtio.c | 4 +- >> net/core/hotdata.c | 4 +- >> net/core/skbuff.c | 4 +- >> net/core/sysctl_net_core.c | 2 +- >> net/sunrpc/cache.c | 3 +- >> net/unix/af_unix.c | 2 +- >> sound/soc/soc-utils.c | 4 +- >> virt/kvm/kvm_main.c | 2 +- >> 172 files changed, 2185 insertions(+), 951 deletions(-) >> create mode 100644 arch/arm64/include/asm/pgtable-geometry.h >> create mode 100644 arch/arm64/kvm/hyp/nvhe/pgtable-geometry.c >> create mode 100644 arch/arm64/mm/pgtable-geometry.c >> create mode 100644 include/asm-generic/pgtable-geometry.h >> >> -- >> 2.43.0 >>
On Mon, 14 Oct 2024 11:55:11 +0100 Ryan Roberts <ryan.roberts@arm.com> wrote: >[...] > The series is arranged as follows: > > - patch 1: Add macros required for converting non-arch code to support > boot-time page size selection > - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all > non-arch code I have just tried to recompile the openSUSE kernel with these patches applied, and I'm running into this: CC arch/arm64/hyperv/hv_core.o In file included from ../arch/arm64/hyperv/hv_core.c:14:0: ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file scope u8 reserved2[PAGE_SIZE - 68]; ^~~~~~~~~ It looks like one more place which needs a patch, right? Petr T
On 17/10/2024 13:27, Petr Tesarik wrote: > On Mon, 14 Oct 2024 11:55:11 +0100 > Ryan Roberts <ryan.roberts@arm.com> wrote: > >> [...] >> The series is arranged as follows: >> >> - patch 1: Add macros required for converting non-arch code to support >> boot-time page size selection >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >> non-arch code > > I have just tried to recompile the openSUSE kernel with these patches > applied, and I'm running into this: > > CC arch/arm64/hyperv/hv_core.o > In file included from ../arch/arm64/hyperv/hv_core.c:14:0: > ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file scope > u8 reserved2[PAGE_SIZE - 68]; > ^~~~~~~~~ > > It looks like one more place which needs a patch, right? As mentioned in the cover letter, so far I've only converted enough to get the defconfig *image* building (i.e. no modules). If you are compiling a different config or compiling the modules for defconfig, you will likely run into these types of issues. That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind enough to send me. I understand that Suse might be able to help with wider performance testing - if that's the reason you are trying to compile, you could send me your config and I'll start working on fixing up other drivers? Thanks, Ryan > > Petr T
Hi Ryan, On Thu, 17 Oct 2024 13:32:43 +0100 Ryan Roberts <ryan.roberts@arm.com> wrote: >[...] > I understand that Suse might be able to help with wider performance testing Sorry for the delay (vacation, other tasks). Anyway, let me share some results with you. First, I have looked only at 4k pages (constant v. selected at boot time) so far. Second, the impact of the patch series is much smaller than I expected. Most macro-benchmarks (dbench, io-bench) did not see any significant slowdown. There appears to be a performance hit of approx. 1-2%, but that's within noise, and I can't dedicate my time to running extensive tests to find the distribution peak and compare. In short, I suspect a slight performance hit, but I cannot quantify it. Third, a few micro-benchmarks saw a significant regression. Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% slower with variable page size. I don't know why, but I'm looking into it. The system() library call was also about 18% slower, but that might be related. The dup() syscall was up to 5% slower (depends on underlying filesystem type). VMA unmap was slower for some sizes, but the pattern seemed random, sometimes giving even better performance with variable page size, so this micro-benchmark may be too unstable to draw any conclusions. Stay tuned Petr T
Hi Petr, On 11/11/2024 12:14, Petr Tesarik wrote: > Hi Ryan, > > On Thu, 17 Oct 2024 13:32:43 +0100 > Ryan Roberts <ryan.roberts@arm.com> wrote: > >> [...] >> I understand that Suse might be able to help with wider performance testing > > Sorry for the delay (vacation, other tasks). Anyway, let me share some > results with you. Not at all; thanks for coming back with these results! > > First, I have looked only at 4k pages (constant v. selected at boot > time) so far. > > Second, the impact of the patch series is much smaller than I expected. > Most macro-benchmarks (dbench, io-bench) did not see any significant > slowdown. There appears to be a performance hit of approx. 1-2%, but > that's within noise, and I can't dedicate my time to running extensive > tests to find the distribution peak and compare. In short, I suspect a > slight performance hit, but I cannot quantify it. > > Third, a few micro-benchmarks saw a significant regression. > > Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% > slower with variable page size. I don't know why, but I'm looking into > it. The system() library call was also about 18% slower, but that might > be related. OK, ouch. I think there are some things we can try to optimize the implementation further. But I'll wait for your analysis before digging myself. You probably also saw the conversation with Catalin about the cost vs benefit of this series. Performance regressions will all need to be considered in the cost column, of course. So understanding the root cause and trying to reduce the regression as much as possible will increase chances of getting it accepted upstream. Thanks, Ryan > > The dup() syscall was up to 5% slower (depends on underlying filesystem > type). > > VMA unmap was slower for some sizes, but the pattern seemed random, > sometimes giving even better performance with variable page size, so > this micro-benchmark may be too unstable to draw any conclusions. > > Stay tuned > Petr T
On Mon, 11 Nov 2024 12:25:35 +0000 Ryan Roberts <ryan.roberts@arm.com> wrote: > Hi Petr, > > On 11/11/2024 12:14, Petr Tesarik wrote: > > Hi Ryan, > > > > On Thu, 17 Oct 2024 13:32:43 +0100 > > Ryan Roberts <ryan.roberts@arm.com> wrote: >[...] > > Third, a few micro-benchmarks saw a significant regression. > > > > Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% > > slower with variable page size. I don't know why, but I'm looking into > > it. The system() library call was also about 18% slower, but that might > > be related. > > OK, ouch. I think there are some things we can try to optimize the > implementation further. But I'll wait for your analysis before digging myself. This turned out to be a false positive. The way this microbenchmark was invoked did not get enough samples, so it was mostly dependent on whether caches were hot or cold, and the timing on this specific system with the specific sequence of benchmarks in the suite happens to favour my baseline kernel. After increasing the batch count, I'm getting pretty much the same performance for 6.11 vanilla and patched kernels: prc thr usecs/call samples errors cnt/samp getenv (baseline) 1 1 0.14975 99 0 100000 getenv (patched) 1 1 0.14981 92 0 100000 > You probably also saw the conversation with Catalin about the cost vs benefit of > this series. Performance regressions will all need to be considered in the cost > column, of course. So understanding the root cause and trying to reduce the > regression as much as possible will increase chances of getting it accepted > upstream. Yes. Now that the biggest number is off the table, I'm going to: - look into the dup() slowdown - verify whether VMA split/merge operations are indeed slower Petr T
On 12/11/2024 09:45, Petr Tesarik wrote: > On Mon, 11 Nov 2024 12:25:35 +0000 > Ryan Roberts <ryan.roberts@arm.com> wrote: > >> Hi Petr, >> >> On 11/11/2024 12:14, Petr Tesarik wrote: >>> Hi Ryan, >>> >>> On Thu, 17 Oct 2024 13:32:43 +0100 >>> Ryan Roberts <ryan.roberts@arm.com> wrote: >> [...] >>> Third, a few micro-benchmarks saw a significant regression. >>> >>> Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% >>> slower with variable page size. I don't know why, but I'm looking into >>> it. The system() library call was also about 18% slower, but that might >>> be related. >> >> OK, ouch. I think there are some things we can try to optimize the >> implementation further. But I'll wait for your analysis before digging myself. > > This turned out to be a false positive. The way this microbenchmark was > invoked did not get enough samples, so it was mostly dependent on > whether caches were hot or cold, and the timing on this specific system > with the specific sequence of bencnmarks in the suite happens to favour > my baseline kernel. > > After increasing the batch count, I'm getting pretty much the same > performance for 6.11 vanilla and patched kernels: > > prc thr usecs/call samples errors cnt/samp > getenv (baseline) 1 1 0.14975 99 0 100000 > getenv (patched) 1 1 0.14981 92 0 100000 Oh that's good news! Does this account for all 3 of the above tests (getenv, getenvT2 and system())? > >> You probably also saw the conversation with Catalin about the cost vs benefit of >> this series. Performance regressions will all need to be considered in the cost >> column, of course. So understanding the root cause and trying to reduce the >> regression as much as possible will increase chances of getting it accepted >> upstream. > > Yes. Now that the biggest number is off the table, I'm going to: > > - look into the dup() slowdown > - verify whether VMA split/merge operations are indeed slower > > Petr T
On Tue, 12 Nov 2024 10:19:34 +0000 Ryan Roberts <ryan.roberts@arm.com> wrote: > On 12/11/2024 09:45, Petr Tesarik wrote: > > On Mon, 11 Nov 2024 12:25:35 +0000 > > Ryan Roberts <ryan.roberts@arm.com> wrote: > > > >> Hi Petr, > >> > >> On 11/11/2024 12:14, Petr Tesarik wrote: > >>> Hi Ryan, > >>> > >>> On Thu, 17 Oct 2024 13:32:43 +0100 > >>> Ryan Roberts <ryan.roberts@arm.com> wrote: > >> [...] > >>> Third, a few micro-benchmarks saw a significant regression. > >>> > >>> Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% > >>> slower with variable page size. I don't know why, but I'm looking into > >>> it. The system() library call was also about 18% slower, but that might > >>> be related. > >> > >> OK, ouch. I think there are some things we can try to optimize the > >> implementation further. But I'll wait for your analysis before digging myself. > > > > This turned out to be a false positive. The way this microbenchmark was > > invoked did not get enough samples, so it was mostly dependent on > > whether caches were hot or cold, and the timing on this specific system > > with the specific sequence of bencnmarks in the suite happens to favour > > my baseline kernel. > > > > After increasing the batch count, I'm getting pretty much the same > > performance for 6.11 vanilla and patched kernels: > > > > prc thr usecs/call samples errors cnt/samp > > getenv (baseline) 1 1 0.14975 99 0 100000 > > getenv (patched) 1 1 0.14981 92 0 100000 > > Oh that's good news! Does this account for all 3 of the above tests (getenv, > getenvT2 and system())? It does for getenvT2 (a variant of the test with 2 threads), but not for system. Thanks for asking, I forgot about that one. I'm getting substantial difference there (+29% on average over 100 runs): prc thr usecs/call samples errors cnt/samp command system (baseline) 1 1 6937.18016 102 0 100 A=$$ system (patched) 1 1 8959.48032 102 0 100 A=$$ So, yeah, this should in fact be my priority #1. 
The "system" benchmark measures the duration of system("A=$$"), which involves starting the system shell (in my case bash-4.4.23), so this is not really a microbenchmark. I hope perf can help match the difference to a kernel API. Petr T
On Tue, 12 Nov 2024 11:50:39 +0100 Petr Tesarik <ptesarik@suse.com> wrote: > On Tue, 12 Nov 2024 10:19:34 +0000 > Ryan Roberts <ryan.roberts@arm.com> wrote: > > > On 12/11/2024 09:45, Petr Tesarik wrote: > > > On Mon, 11 Nov 2024 12:25:35 +0000 > > > Ryan Roberts <ryan.roberts@arm.com> wrote: > > > > > >> Hi Petr, > > >> > > >> On 11/11/2024 12:14, Petr Tesarik wrote: > > >>> Hi Ryan, > > >>> > > >>> On Thu, 17 Oct 2024 13:32:43 +0100 > > >>> Ryan Roberts <ryan.roberts@arm.com> wrote: > > >> [...] > > >>> Third, a few micro-benchmarks saw a significant regression. > > >>> > > >>> Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% > > >>> slower with variable page size. I don't know why, but I'm looking into > > >>> it. The system() library call was also about 18% slower, but that might > > >>> be related. > > >> > > >> OK, ouch. I think there are some things we can try to optimize the > > >> implementation further. But I'll wait for your analysis before digging myself. > > > > > > This turned out to be a false positive. The way this microbenchmark was > > > invoked did not get enough samples, so it was mostly dependent on > > > whether caches were hot or cold, and the timing on this specific system > > > with the specific sequence of bencnmarks in the suite happens to favour > > > my baseline kernel. > > > > > > After increasing the batch count, I'm getting pretty much the same > > > performance for 6.11 vanilla and patched kernels: > > > > > > prc thr usecs/call samples errors cnt/samp > > > getenv (baseline) 1 1 0.14975 99 0 100000 > > > getenv (patched) 1 1 0.14981 92 0 100000 > > > > Oh that's good news! Does this account for all 3 of the above tests (getenv, > > getenvT2 and system())? > > It does for getenvT2 (a variant of the test with 2 threads), but not > for system. Thanks for asking, I forgot about that one. 
> > I'm getting substantial difference there (+29% on average over 100 runs): > > prc thr usecs/call samples errors cnt/samp command > system (baseline) 1 1 6937.18016 102 0 100 A=$$ > system (patched) 1 1 8959.48032 102 0 100 A=$$ > > So, yeah, this should in fact be my priority #1. Further testing reveals the workload is bimodal, that is to say the distribution of results has two peaks. The first peak around 3.2 ms covers 30% of runs, the second peak around 15.7 ms covers 11%. Two per cent are faster than the fast peak, 5% are slower than the slow peak, the rest is distributed almost evenly between them. 100 samples were not sufficient to see this distribution, and it was mere bad luck that only the patched kernel originally reported bad results. I can now see bad results even with the unpatched kernel. In short, I don't think there is a difference in system() performance. I will still have a look at dup() and VMA performance, but so far it all looks good to me. Good job! ;-) I will also try running a more complete set of benchmarks during next week. That's SUSE Hack Week, and I want to make a PoC for the MM changes I proposed at LPC24, so I won't need this Ampere system for interactive use. Petr T
On 13/11/2024 12:40, Petr Tesarik wrote: > On Tue, 12 Nov 2024 11:50:39 +0100 > Petr Tesarik <ptesarik@suse.com> wrote: > >> On Tue, 12 Nov 2024 10:19:34 +0000 >> Ryan Roberts <ryan.roberts@arm.com> wrote: >> >>> On 12/11/2024 09:45, Petr Tesarik wrote: >>>> On Mon, 11 Nov 2024 12:25:35 +0000 >>>> Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>>> Hi Petr, >>>>> >>>>> On 11/11/2024 12:14, Petr Tesarik wrote: >>>>>> Hi Ryan, >>>>>> >>>>>> On Thu, 17 Oct 2024 13:32:43 +0100 >>>>>> Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>> [...] >>>>>> Third, a few micro-benchmarks saw a significant regression. >>>>>> >>>>>> Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% >>>>>> slower with variable page size. I don't know why, but I'm looking into >>>>>> it. The system() library call was also about 18% slower, but that might >>>>>> be related. >>>>> >>>>> OK, ouch. I think there are some things we can try to optimize the >>>>> implementation further. But I'll wait for your analysis before digging myself. >>>> >>>> This turned out to be a false positive. The way this microbenchmark was >>>> invoked did not get enough samples, so it was mostly dependent on >>>> whether caches were hot or cold, and the timing on this specific system >>>> with the specific sequence of bencnmarks in the suite happens to favour >>>> my baseline kernel. >>>> >>>> After increasing the batch count, I'm getting pretty much the same >>>> performance for 6.11 vanilla and patched kernels: >>>> >>>> prc thr usecs/call samples errors cnt/samp >>>> getenv (baseline) 1 1 0.14975 99 0 100000 >>>> getenv (patched) 1 1 0.14981 92 0 100000 >>> >>> Oh that's good news! Does this account for all 3 of the above tests (getenv, >>> getenvT2 and system())? >> >> It does for getenvT2 (a variant of the test with 2 threads), but not >> for system. Thanks for asking, I forgot about that one. 
>> >> I'm getting substantial difference there (+29% on average over 100 runs): >> >> prc thr usecs/call samples errors cnt/samp command >> system (baseline) 1 1 6937.18016 102 0 100 A=$$ >> system (patched) 1 1 8959.48032 102 0 100 A=$$ >> >> So, yeah, this should in fact be my priority #1. > > Further testing reveals the workload is bimodal, that is to say the > distribution of results has two peaks. The first peak around 3.2 ms > covers 30% runs, the second peak around 15.7 ms covers 11%. Two per > cent are faster than the fast peak, 5% are slower than slow peak, the > rest is distributed almost evenly between them. FWIW, one source of bimodality I've seen on Ampere systems with 2 NUMA nodes is placement of the kernel image vs placement of the running thread. If they are remote from each other, you'll see a slowdown. I've hacked this source away in the past by effectively using only a single NUMA node (with the help of 'maxcpus' and 'mem' kernel cmdline options). > > 100 samples were not sufficient to see this distribution, and it was > mere bad luck that only the patched kernel originally reported bad > results. I can now see bad results even with the unpatched kernel. > > In short, I don't think there is a difference in system() performance. > > I will still have a look at dup() and VMA performance, but so far it > all looks good to me. Good job! ;-) Thanks for digging into all this! > > I will also try running a more complete set of benchmarks during next > week. That's SUSE Hack Week, and I want to make a PoC for the MM > changes I proposed at LPC24, so I won't need this Ampere system for > interactive use. > > Petr T
On Wed, 13 Nov 2024 12:56:24 +0000 Ryan Roberts <ryan.roberts@arm.com> wrote: > On 13/11/2024 12:40, Petr Tesarik wrote: > > On Tue, 12 Nov 2024 11:50:39 +0100 > > Petr Tesarik <ptesarik@suse.com> wrote: > > > >> On Tue, 12 Nov 2024 10:19:34 +0000 > >> Ryan Roberts <ryan.roberts@arm.com> wrote: > >> > >>> On 12/11/2024 09:45, Petr Tesarik wrote: > >>>> On Mon, 11 Nov 2024 12:25:35 +0000 > >>>> Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>> > >>>>> Hi Petr, > >>>>> > >>>>> On 11/11/2024 12:14, Petr Tesarik wrote: > >>>>>> Hi Ryan, > >>>>>> > >>>>>> On Thu, 17 Oct 2024 13:32:43 +0100 > >>>>>> Ryan Roberts <ryan.roberts@arm.com> wrote: > >>>>> [...] > >>>>>> Third, a few micro-benchmarks saw a significant regression. > >>>>>> > >>>>>> Most notably, getenv and getenvT2 tests from libMicro were 18% and 20% > >>>>>> slower with variable page size. I don't know why, but I'm looking into > >>>>>> it. The system() library call was also about 18% slower, but that might > >>>>>> be related. > >>>>> > >>>>> OK, ouch. I think there are some things we can try to optimize the > >>>>> implementation further. But I'll wait for your analysis before digging myself. > >>>> > >>>> This turned out to be a false positive. The way this microbenchmark was > >>>> invoked did not get enough samples, so it was mostly dependent on > >>>> whether caches were hot or cold, and the timing on this specific system > >>>> with the specific sequence of bencnmarks in the suite happens to favour > >>>> my baseline kernel. > >>>> > >>>> After increasing the batch count, I'm getting pretty much the same > >>>> performance for 6.11 vanilla and patched kernels: > >>>> > >>>> prc thr usecs/call samples errors cnt/samp > >>>> getenv (baseline) 1 1 0.14975 99 0 100000 > >>>> getenv (patched) 1 1 0.14981 92 0 100000 > >>> > >>> Oh that's good news! Does this account for all 3 of the above tests (getenv, > >>> getenvT2 and system())? 
> >> > >> It does for getenvT2 (a variant of the test with 2 threads), but not > >> for system. Thanks for asking, I forgot about that one. > >> > >> I'm getting substantial difference there (+29% on average over 100 runs): > >> > >> prc thr usecs/call samples errors cnt/samp command > >> system (baseline) 1 1 6937.18016 102 0 100 A=$$ > >> system (patched) 1 1 8959.48032 102 0 100 A=$$ > >> > >> So, yeah, this should in fact be my priority #1. > > > > Further testing reveals the workload is bimodal, that is to say the > > distribution of results has two peaks. The first peak around 3.2 ms > > covers 30% runs, the second peak around 15.7 ms covers 11%. Two per > > cent are faster than the fast peak, 5% are slower than slow peak, the > > rest is distributed almost evenly between them. > > FWIW, One source of bimodality I've seen on Ampere systems with 2 NUMA nodes is > placement of the kernel image vs placement of the running thread. If they are > remote from eachother, you'll see a slowdown. I've hacked this source away in > the past by effectively using only a single NUMA node (with the help of > 'maxcpus' and 'mem' kernel cmdline options). This system has only one NUMA node. But your comment leads in the right direction. CPU placement does play a role here. I can consistently get the fast results if I pin the benchmark process to a single CPU core, or more generally to a CPU set which shares the L2 cache (as found on eMAG). But the scheduler only considers LLC, which (with CONFIG_SCHED_CLUSTER=y) follows the complex affinity of the SLC. Long story short, without explicit affinity, the scheduler may place a forked child onto a CPU with a cold L2 cache, which harms short-lived processes (like the ones created by this benchmark). Now it all makes sense and it is totally unrelated to dynamic page size selection. :-) Petr T
On 10/17/2024 8:32 AM, Ryan Roberts wrote: > On 17/10/2024 13:27, Petr Tesarik wrote: >> On Mon, 14 Oct 2024 11:55:11 +0100 >> Ryan Roberts <ryan.roberts@arm.com> wrote: >> >>> [...] >>> The series is arranged as follows: >>> >>> - patch 1: Add macros required for converting non-arch code to support >>> boot-time page size selection >>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >>> non-arch code >> I have just tried to recompile the openSUSE kernel with these patches >> applied, and I'm running into this: >> >> CC arch/arm64/hyperv/hv_core.o >> In file included from ../arch/arm64/hyperv/hv_core.c:14:0: >> ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file scope >> u8 reserved2[PAGE_SIZE - 68]; >> ^~~~~~~~~ >> >> It looks like one more place which needs a patch, right? > As mentioned in the cover letter, so far I've only converted enough to get the > defconfig *image* building (i.e. no modules). If you are compiling a different > config or compiling the modules for defconfig, you will likely run into these > types of issues. It would be nice if you could provide the defconfig you are using; I also ran into build issues when using the arch/arm64/configs/defconfig. Thank you, Thomas > > That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind > enough to send me. > > I understand that Suse might be able to help with wider performance testing - if > that's the reason you are trying to compile, you could send me your config and > I'll start working on fixing up other drivers? > > Thanks, > Ryan > >> Petr T >
On 23/10/2024 22:00, Thomas Tai wrote: > > On 10/17/2024 8:32 AM, Ryan Roberts wrote: >> On 17/10/2024 13:27, Petr Tesarik wrote: >>> On Mon, 14 Oct 2024 11:55:11 +0100 >>> Ryan Roberts <ryan.roberts@arm.com> wrote: >>> >>>> [...] >>>> The series is arranged as follows: >>>> >>>> - patch 1: Add macros required for converting non-arch code to support >>>> boot-time page size selection >>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >>>> non-arch code >>> I have just tried to recompile the openSUSE kernel with these patches >>> applied, and I'm running into this: >>> >>> CC arch/arm64/hyperv/hv_core.o >>> In file included from ../arch/arm64/hyperv/hv_core.c:14:0: >>> ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file >>> scope >>> u8 reserved2[PAGE_SIZE - 68]; >>> ^~~~~~~~~ >>> >>> It looks like one more place which needs a patch, right? >> As mentioned in the cover letter, so far I've only converted enough to get the >> defconfig *image* building (i.e. no modules). If you are compiling a different >> config or compiling the modules for defconfig, you will likely run into these >> types of issues. > > It would be nice if you could provide the defconfig you are using; I also ran > into build issues when using the arch/arm64/configs/defconfig. 
git clean -xdfq make defconfig # Set CONFIG_ARM64_BOOT_TIME_PAGE_SIZE ./scripts/config --disable CONFIG_ARM64_4K_PAGES ./scripts/config --disable CONFIG_ARM64_16K_PAGES ./scripts/config --disable CONFIG_ARM64_64K_PAGES ./scripts/config --disable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE # Set ARM64_VA_BITS_48 ./scripts/config --disable ARM64_VA_BITS_36 ./scripts/config --disable ARM64_VA_BITS_39 ./scripts/config --disable ARM64_VA_BITS_42 ./scripts/config --disable ARM64_VA_BITS_47 ./scripts/config --disable ARM64_VA_BITS_48 ./scripts/config --disable ARM64_VA_BITS_52 ./scripts/config --enable ARM64_VA_BITS_48 # Optional: filesystems known to compile with boot-time page size ./scripts/config --enable CONFIG_SQUASHFS_LZ4 ./scripts/config --enable CONFIG_SQUASHFS_LZO ./scripts/config --enable CONFIG_SQUASHFS_XZ ./scripts/config --enable CONFIG_SQUASHFS_ZSTD ./scripts/config --enable CONFIG_XFS_FS # Optional: trace stuff known to compile with boot-time page size ./scripts/config --enable CONFIG_FTRACE ./scripts/config --enable CONFIG_FUNCTION_TRACER ./scripts/config --enable CONFIG_KPROBES ./scripts/config --enable CONFIG_HIST_TRIGGERS ./scripts/config --enable CONFIG_FTRACE_SYSCALLS # Optional: misc mm stuff known to compile with boot-time page size ./scripts/config --enable CONFIG_PTDUMP_DEBUGFS ./scripts/config --enable CONFIG_READ_ONLY_THP_FOR_FS ./scripts/config --enable CONFIG_USERFAULTFD # Optional: mm debug stuff known to compile with boot-time page size ./scripts/config --enable CONFIG_DEBUG_VM ./scripts/config --enable CONFIG_DEBUG_VM_MAPLE_TREE ./scripts/config --enable CONFIG_DEBUG_VM_RB ./scripts/config --enable CONFIG_DEBUG_VM_PGFLAGS ./scripts/config --enable CONFIG_DEBUG_VM_PGTABLE ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK_ENFORCED make olddefconfig make -s -j`nproc` Image So I'm explicitly only building and booting the kernel image, not the 
modules. The kernel image contains all the drivers needed to get a VM up and running under QEMU/KVM. Thanks, Ryan > > Thank you, > Thomas > >> >> That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind >> enough to send me. >> >> I understand that Suse might be able to help with wider performance testing - if >> that's the reason you are trying to compile, you could send me your config and >> I'll start working on fixing up other drivers? >> >> Thanks, >> Ryan >> >>> Petr T >>
On 24/10/24 16:18, Ryan Roberts wrote: > External email: Use caution opening links or attachments > > > On 23/10/2024 22:00, Thomas Tai wrote: >> >> On 10/17/2024 8:32 AM, Ryan Roberts wrote: >>> On 17/10/2024 13:27, Petr Tesarik wrote: >>>> On Mon, 14 Oct 2024 11:55:11 +0100 >>>> Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>>> [...] >>>>> The series is arranged as follows: >>>>> >>>>> - patch 1: Add macros required for converting non-arch code to support >>>>> boot-time page size selection >>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >>>>> non-arch code >>>> I have just tried to recompile the openSUSE kernel with these patches >>>> applied, and I'm running into this: >>>> >>>> CC arch/arm64/hyperv/hv_core.o >>>> In file included from ../arch/arm64/hyperv/hv_core.c:14:0: >>>> ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file >>>> scope >>>> u8 reserved2[PAGE_SIZE - 68]; >>>> ^~~~~~~~~ >>>> >>>> It looks like one more place which needs a patch, right? >>> As mentioned in the cover letter, so far I've only converted enough to get the >>> defconfig *image* building (i.e. no modules). If you are compiling a different >>> config or compiling the modules for defconfig, you will likely run into these >>> types of issues. >> >> It would be nice if you could provide the defconfig you are using; I also ran >> into build issues when using the arch/arm64/configs/defconfig. 
> > git clean -xdfq > make defconfig > > # Set CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > ./scripts/config --disable CONFIG_ARM64_4K_PAGES > ./scripts/config --disable CONFIG_ARM64_16K_PAGES > ./scripts/config --disable CONFIG_ARM64_64K_PAGES > ./scripts/config --disable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > > # Set ARM64_VA_BITS_48 > ./scripts/config --disable ARM64_VA_BITS_36 > ./scripts/config --disable ARM64_VA_BITS_39 > ./scripts/config --disable ARM64_VA_BITS_42 > ./scripts/config --disable ARM64_VA_BITS_47 > ./scripts/config --disable ARM64_VA_BITS_48 > ./scripts/config --disable ARM64_VA_BITS_52 > ./scripts/config --enable ARM64_VA_BITS_48 > > # Optional: filesystems known to compile with boot-time page size > ./scripts/config --enable CONFIG_SQUASHFS_LZ4 > ./scripts/config --enable CONFIG_SQUASHFS_LZO > ./scripts/config --enable CONFIG_SQUASHFS_XZ > ./scripts/config --enable CONFIG_SQUASHFS_ZSTD > ./scripts/config --enable CONFIG_XFS_FS > > # Optional: trace stuff known to compile with boot-time page size > ./scripts/config --enable CONFIG_FTRACE > ./scripts/config --enable CONFIG_FUNCTION_TRACER > ./scripts/config --enable CONFIG_KPROBES > ./scripts/config --enable CONFIG_HIST_TRIGGERS > ./scripts/config --enable CONFIG_FTRACE_SYSCALLS > > # Optional: misc mm stuff known to compile with boot-time page size > ./scripts/config --enable CONFIG_PTDUMP_DEBUGFS > ./scripts/config --enable CONFIG_READ_ONLY_THP_FOR_FS > ./scripts/config --enable CONFIG_USERFAULTFD > > # Optional: mm debug stuff known compile with boot-time page size > ./scripts/config --enable CONFIG_DEBUG_VM > ./scripts/config --enable CONFIG_DEBUG_VM_MAPLE_TREE > ./scripts/config --enable CONFIG_DEBUG_VM_RB > ./scripts/config --enable CONFIG_DEBUG_VM_PGFLAGS > ./scripts/config --enable CONFIG_DEBUG_VM_PGTABLE > ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK > ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK_ENFORCED > > make olddefconfig > 
make -s -j`nproc` Image > > So I'm explicitly only building and booting the kernel image, not the modules. > The kernel image contains all the drivers needed to get a VM up and running > under QEMU/KVM. > > Thanks, > Ryan > Thank you for this patch set. I was able to boot with minimal configs on Tegra234 board. Will enable more configs and discuss. Thank you, Sumit Gupta >> >> Thank you, >> Thomas >> >>> >>> That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind >>> enough to send me. >>> >>> I understand that Suse might be able to help with wider performance testing - if >>> that's the reason you are trying to compile, you could send me your config and >>> I'll start working on fixing up other drivers? >>> >>> Thanks, >>> Ryan >>> >>>> Petr T >>> > >
On Thu, 24 Oct 2024 11:48:55 +0100 Ryan Roberts <ryan.roberts@arm.com> wrote: > On 23/10/2024 22:00, Thomas Tai wrote: > > > > On 10/17/2024 8:32 AM, Ryan Roberts wrote: > >> On 17/10/2024 13:27, Petr Tesarik wrote: > >>> On Mon, 14 Oct 2024 11:55:11 +0100 > >>> Ryan Roberts <ryan.roberts@arm.com> wrote: > >>> > >>>> [...] > >>>> The series is arranged as follows: > >>>> > >>>> - patch 1: Add macros required for converting non-arch code to support > >>>> boot-time page size selection > >>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all > >>>> non-arch code > >>> I have just tried to recompile the openSUSE kernel with these patches > >>> applied, and I'm running into this: > >>> > >>> CC arch/arm64/hyperv/hv_core.o > >>> In file included from ../arch/arm64/hyperv/hv_core.c:14:0: > >>> ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file > >>> scope > >>> u8 reserved2[PAGE_SIZE - 68]; > >>> ^~~~~~~~~ > >>> > >>> It looks like one more place which needs a patch, right? > >> As mentioned in the cover letter, so far I've only converted enough to get the > >> defconfig *image* building (i.e. no modules). If you are compiling a different > >> config or compiling the modules for defconfig, you will likely run into these > >> types of issues. > > > > It would be nice if you could provide the defconfig you are using; I also ran > > into build issues when using the arch/arm64/configs/defconfig. 
> > git clean -xdfq > make defconfig > > # Set CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > ./scripts/config --disable CONFIG_ARM64_4K_PAGES > ./scripts/config --disable CONFIG_ARM64_16K_PAGES > ./scripts/config --disable CONFIG_ARM64_64K_PAGES > ./scripts/config --disable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE > > # Set ARM64_VA_BITS_48 > ./scripts/config --disable ARM64_VA_BITS_36 > ./scripts/config --disable ARM64_VA_BITS_39 > ./scripts/config --disable ARM64_VA_BITS_42 > ./scripts/config --disable ARM64_VA_BITS_47 > ./scripts/config --disable ARM64_VA_BITS_48 > ./scripts/config --disable ARM64_VA_BITS_52 > ./scripts/config --enable ARM64_VA_BITS_48 > > # Optional: filesystems known to compile with boot-time page size > ./scripts/config --enable CONFIG_SQUASHFS_LZ4 > ./scripts/config --enable CONFIG_SQUASHFS_LZO > ./scripts/config --enable CONFIG_SQUASHFS_XZ > ./scripts/config --enable CONFIG_SQUASHFS_ZSTD > ./scripts/config --enable CONFIG_XFS_FS > > # Optional: trace stuff known to compile with boot-time page size > ./scripts/config --enable CONFIG_FTRACE > ./scripts/config --enable CONFIG_FUNCTION_TRACER > ./scripts/config --enable CONFIG_KPROBES > ./scripts/config --enable CONFIG_HIST_TRIGGERS > ./scripts/config --enable CONFIG_FTRACE_SYSCALLS > > # Optional: misc mm stuff known to compile with boot-time page size > ./scripts/config --enable CONFIG_PTDUMP_DEBUGFS > ./scripts/config --enable CONFIG_READ_ONLY_THP_FOR_FS > ./scripts/config --enable CONFIG_USERFAULTFD > > # Optional: mm debug stuff known compile with boot-time page size > ./scripts/config --enable CONFIG_DEBUG_VM > ./scripts/config --enable CONFIG_DEBUG_VM_MAPLE_TREE > ./scripts/config --enable CONFIG_DEBUG_VM_RB > ./scripts/config --enable CONFIG_DEBUG_VM_PGFLAGS > ./scripts/config --enable CONFIG_DEBUG_VM_PGTABLE > ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK > ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK_ENFORCED > > make olddefconfig > 
make -s -j`nproc` Image > > So I'm explicitly only building and booting the kernel image, not the modules. > The kernel image contains all the drivers needed to get a VM up and running > under QEMU/KVM. FWIW with the attached patch I was also able to boot the kernel on Ampere Altra bare metal and using modules. Petr T diff --git a/arch/arm64/mm/pgtable-geometry.c b/arch/arm64/mm/pgtable-geometry.c index ba50637f1e9d..4eb074b99654 100644 --- a/arch/arm64/mm/pgtable-geometry.c +++ b/arch/arm64/mm/pgtable-geometry.c @@ -15,8 +15,14 @@ */ int ptg_page_shift __read_mostly; +EXPORT_SYMBOL_GPL(ptg_page_shift); + int ptg_pmd_shift __read_mostly; +EXPORT_SYMBOL_GPL(ptg_pmd_shift); + int ptg_pud_shift __read_mostly; +EXPORT_SYMBOL_GPL(ptg_pud_shift); + int ptg_p4d_shift __read_mostly; int ptg_pgdir_shift __read_mostly; int ptg_cont_pte_shift __read_mostly;
On 24/10/2024 12:45, Petr Tesarik wrote: > On Thu, 24 Oct 2024 11:48:55 +0100 > Ryan Roberts <ryan.roberts@arm.com> wrote: > >> On 23/10/2024 22:00, Thomas Tai wrote: >>> >>> On 10/17/2024 8:32 AM, Ryan Roberts wrote: >>>> On 17/10/2024 13:27, Petr Tesarik wrote: >>>>> On Mon, 14 Oct 2024 11:55:11 +0100 >>>>> Ryan Roberts <ryan.roberts@arm.com> wrote: >>>>> >>>>>> [...] >>>>>> The series is arranged as follows: >>>>>> >>>>>> - patch 1: Add macros required for converting non-arch code to support >>>>>> boot-time page size selection >>>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >>>>>> non-arch code >>>>> I have just tried to recompile the openSUSE kernel with these patches >>>>> applied, and I'm running into this: >>>>> >>>>> CC arch/arm64/hyperv/hv_core.o >>>>> In file included from ../arch/arm64/hyperv/hv_core.c:14:0: >>>>> ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file >>>>> scope >>>>> u8 reserved2[PAGE_SIZE - 68]; >>>>> ^~~~~~~~~ >>>>> >>>>> It looks like one more place which needs a patch, right? >>>> As mentioned in the cover letter, so far I've only converted enough to get the >>>> defconfig *image* building (i.e. no modules). If you are compiling a different >>>> config or compiling the modules for defconfig, you will likely run into these >>>> types of issues. >>> >>> It would be nice if you could provide the defconfig you are using; I also ran >>> into build issues when using the arch/arm64/configs/defconfig. 
>> >> git clean -xdfq >> make defconfig >> >> # Set CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >> ./scripts/config --disable CONFIG_ARM64_4K_PAGES >> ./scripts/config --disable CONFIG_ARM64_16K_PAGES >> ./scripts/config --disable CONFIG_ARM64_64K_PAGES >> ./scripts/config --disable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >> ./scripts/config --enable CONFIG_ARM64_BOOT_TIME_PAGE_SIZE >> >> # Set ARM64_VA_BITS_48 >> ./scripts/config --disable ARM64_VA_BITS_36 >> ./scripts/config --disable ARM64_VA_BITS_39 >> ./scripts/config --disable ARM64_VA_BITS_42 >> ./scripts/config --disable ARM64_VA_BITS_47 >> ./scripts/config --disable ARM64_VA_BITS_48 >> ./scripts/config --disable ARM64_VA_BITS_52 >> ./scripts/config --enable ARM64_VA_BITS_48 >> >> # Optional: filesystems known to compile with boot-time page size >> ./scripts/config --enable CONFIG_SQUASHFS_LZ4 >> ./scripts/config --enable CONFIG_SQUASHFS_LZO >> ./scripts/config --enable CONFIG_SQUASHFS_XZ >> ./scripts/config --enable CONFIG_SQUASHFS_ZSTD >> ./scripts/config --enable CONFIG_XFS_FS >> >> # Optional: trace stuff known to compile with boot-time page size >> ./scripts/config --enable CONFIG_FTRACE >> ./scripts/config --enable CONFIG_FUNCTION_TRACER >> ./scripts/config --enable CONFIG_KPROBES >> ./scripts/config --enable CONFIG_HIST_TRIGGERS >> ./scripts/config --enable CONFIG_FTRACE_SYSCALLS >> >> # Optional: misc mm stuff known to compile with boot-time page size >> ./scripts/config --enable CONFIG_PTDUMP_DEBUGFS >> ./scripts/config --enable CONFIG_READ_ONLY_THP_FOR_FS >> ./scripts/config --enable CONFIG_USERFAULTFD >> >> # Optional: mm debug stuff known compile with boot-time page size >> ./scripts/config --enable CONFIG_DEBUG_VM >> ./scripts/config --enable CONFIG_DEBUG_VM_MAPLE_TREE >> ./scripts/config --enable CONFIG_DEBUG_VM_RB >> ./scripts/config --enable CONFIG_DEBUG_VM_PGFLAGS >> ./scripts/config --enable CONFIG_DEBUG_VM_PGTABLE >> ./scripts/config --enable CONFIG_PAGE_TABLE_CHECK >> ./scripts/config --enable 
CONFIG_PAGE_TABLE_CHECK_ENFORCED >> >> make olddefconfig >> make -s -j`nproc` Image >> >> So I'm explicitly only building and booting the kernel image, not the modules. >> The kernel image contains all the drivers needed to get a VM up and running >> under QEMU/KVM. > > FWIW with the attached patch I was also able to boot the kernel on > Ampere Altra bare metal and using modules. Nice! Thanks for the below. That was already reported and I have a fix in my branch at [1]. That also includes the btrfs patch you sent and the hyper-v patches, as well as other fixups from review. [1] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/boot-time-page-size-v2-wip Thanks, Ryan > > Petr T > > diff --git a/arch/arm64/mm/pgtable-geometry.c b/arch/arm64/mm/pgtable-geometry.c > index ba50637f1e9d..4eb074b99654 100644 > --- a/arch/arm64/mm/pgtable-geometry.c > +++ b/arch/arm64/mm/pgtable-geometry.c > @@ -15,8 +15,14 @@ > */ > > int ptg_page_shift __read_mostly; > +EXPORT_SYMBOL_GPL(ptg_page_shift); > + > int ptg_pmd_shift __read_mostly; > +EXPORT_SYMBOL_GPL(ptg_pmd_shift); > + > int ptg_pud_shift __read_mostly; > +EXPORT_SYMBOL_GPL(ptg_pud_shift); > + > int ptg_p4d_shift __read_mostly; > int ptg_pgdir_shift __read_mostly; > int ptg_cont_pte_shift __read_mostly;
On Thu, 17 Oct 2024 13:32:43 +0100 Ryan Roberts <ryan.roberts@arm.com> wrote: > On 17/10/2024 13:27, Petr Tesarik wrote: > > On Mon, 14 Oct 2024 11:55:11 +0100 > > Ryan Roberts <ryan.roberts@arm.com> wrote: > > > >> [...] > >> The series is arranged as follows: > >> > >> - patch 1: Add macros required for converting non-arch code to support > >> boot-time page size selection > >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all > >> non-arch code > > > > I have just tried to recompile the openSUSE kernel with these patches > > applied, and I'm running into this: > > > > CC arch/arm64/hyperv/hv_core.o > > In file included from ../arch/arm64/hyperv/hv_core.c:14:0: > > ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file scope > > u8 reserved2[PAGE_SIZE - 68]; > > ^~~~~~~~~ > > > > It looks like one more place which needs a patch, right? > > As mentioned in the cover letter, so far I've only converted enough to get the > defconfig *image* building (i.e. no modules). If you are compiling a different > config or compiling the modules for defconfig, you will likely run into these > types of issues. > > That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind > enough to send me. > > I understand that Suse might be able to help with wider performance testing - if > that's the reason you are trying to compile, you could send me your config and > I'll start working on fixing up other drivers? You're right, performance testing is my goal. Heh, the openSUSE master config is cranked up to max. ;-) That would be a lot of work, and we don't need all those options for running our test suite. Let me disable the conflicting options instead. 
For reference, here's a long (yet incomplete) list of kernel options that conflict with this v1 patch series: # already handled by Michael CONFIG_HYPERV # sorry, Windows CONFIG_CIFS CONFIG_NTFS3_FS # no, not even with ntfs-3g CONFIG_FUSE_FS # bye-bye ZSWAP CONFIG_ZBUD CONFIG_Z3FOLD CONFIG_ZSMALLOC # ah, also bye-bye ZRAM # who needs redundancy? CONFIG_DM_RAID CONFIG_MD_RAID1 CONFIG_MD_RAID456 CONFIG_MD_RAID10 # who needs security? CONFIG_SECURITY_SELINUX # or integrity? CONFIG_IMA CONFIG_DM_INTEGRITY # or even crypto (this disables A LOT of stuff)... CONFIG_CRYPTO_MANAGER2 # meh... CONFIG_ARM_SMMU_V3_SVA CONFIG_ACPI_NFIT CONFIG_DEV_DAX_PMEM CONFIG_NVDIMM CONFIG_MTD_SWAP CONFIG_MLXBF_PMC CONFIG_THUNDERX2_PMU CONFIG_LKDTM CONFIG_VMWARE_VMCI CONFIG_HT16K33 CONFIG_FB_TFT_HX8340BN CONFIG_FB_TFT_ILI9341 CONFIG_DVB_FIREDTV CONFIG_DVB_PT3 CONFIG_VIDEO_ET8EK8 CONFIG_VIDEO_IVTV CONFIG_VIDEO_SAA7164 CONFIG_DRM_AMDGPU CONFIG_DRM_POWERVR CONFIG_DRM_QXL CONFIG_DRM_RADEON CONFIG_DRM_VMWGFX CONFIG_FIREWIRE_OHCI CONFIG_SND_SEQ_MIDI CONFIG_SND_DARLA20 CONFIG_SND_GINA20 CONFIG_SND_LAYLA20 CONFIG_SND_DARLA24 CONFIG_SND_DARLA24 CONFIG_SND_GINA24 CONFIG_SND_MONA CONFIG_SND_MIA CONFIG_SND_ECHO3G CONFIG_SND_INDIGO CONFIG_SND_INDIGOIO CONFIG_SND_INDIGODJ CONFIG_SND_INDIGOIOX CONFIG_SND_INDIGODJX CONFIG_SND_BCM63XX_I2S_WHISTLER CONFIG_SND_SOC_SOF CONFIG_SND_SOC_SPRD CONFIG_SND_SOC_STM32_SAI CONFIG_SND_SOC_STM32_I2S CONFIG_SND_SOC_STM32_SPDIFRX CONFIG_SND_SOC_STM32_DFSDM CONFIG_SND_SOC_TEGRA CONFIG_SND_SOC_CROS_EC_CODEC CONFIG_SND_SOC_RT5514_SPI CONFIG_SND_USB_UA101 CONFIG_USB_F_PHONET CONFIG_USB_F_TCM CONFIG_SPI_LOOPBACK_TEST CONFIG_W1 CONFIG_RDS CONFIG_TIPC CONFIG_TCP_SIGPOOL CONFIG_OPENVSWITCH CONFIG_NIU CONFIG_QED_SRIOV CONFIG_SFC CONFIG_SFC_FALCON CONFIG_SFC_SIENA CONFIG_TSNEP CONFIG_LIBERTAS CONFIG_LOOPBACK_TARGET CONFIG_SUNRPC_XPRT_RDMA CONFIG_INFINIBAND_HNS CONFIG_INFINIBAND_IPOIB CONFIG_INFINIBAND_EFA CONFIG_INFINIBAND_MTHCA CONFIG_MLX4_CORE CONFIG_MLX4_INFINIBAND CONFIG_MLX5_CORE 
CONFIG_MLX5_INFINIBAND CONFIG_MLX5_VDPA_NET CONFIG_MLX5_VFIO_PCI CONFIG_ISCSI_TCP CONFIG_SCSI_CXGB3_ISCSI CONFIG_SCSI_CXGB4_ISCSI CONFIG_SCSI_DC395x CONFIG_SCSI_DMX3191D CONFIG_SCSI_FDOMAIN CONFIG_SCSI_MVUMI CONFIG_SCSI_STEX CONFIG_SCSI_SYM53C8XX_2 CONFIG_CDROM_PKTCDVD CONFIG_AFS_FS CONFIG_BCACHE CONFIG_BCACHEFS_FS CONFIG_CEPH_FS CONFIG_DLM CONFIG_BLK_DEV_NULL_BLK CONFIG_BLK_DEV_DRBD CONFIG_BLK_DEV_RBD CONFIG_OCFS2_FS CONFIG_CRAMFS CONFIG_EROFS_FS CONFIG_ECRYPT_FS CONFIG_F2FS_FS CONFIG_ZISOFS CONFIG_NFS_V3_ACL # would be nice to have... CONFIG_NFSD_V4 CONFIG_SUNRPC_BACKCHANNEL # required by CONFIG_NFS_V4_1 CONFIG_MMC CONFIG_NVME_CORE CONFIG_NVMEM # required by CONFIG_USB4 CONFIG_USB_UAS CONFIG_BLK_DEV_DM # ...but this is kind of really necessary CONFIG_BTRFS_FS After disabling all the above and exporting ptg_page_shift, the tumbleweed kernel builds. TBH I expected more broken things. Great success! ;-) I'll see if I can do something about btrfs. Then I can try to boot the kernel... Petr T
On Fri, 18 Oct 2024 14:56:00 +0200 Petr Tesarik <ptesarik@suse.com> wrote: > On Thu, 17 Oct 2024 13:32:43 +0100 > Ryan Roberts <ryan.roberts@arm.com> wrote: > > > On 17/10/2024 13:27, Petr Tesarik wrote: > > > On Mon, 14 Oct 2024 11:55:11 +0100 > > > Ryan Roberts <ryan.roberts@arm.com> wrote: > > > > > >> [...] > > >> The series is arranged as follows: > > >> > > >> - patch 1: Add macros required for converting non-arch code to support > > >> boot-time page size selection > > >> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all > > >> non-arch code > > > > > > I have just tried to recompile the openSUSE kernel with these patches > > > applied, and I'm running into this: > > > > > > CC arch/arm64/hyperv/hv_core.o > > > In file included from ../arch/arm64/hyperv/hv_core.c:14:0: > > > ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file scope > > > u8 reserved2[PAGE_SIZE - 68]; > > > ^~~~~~~~~ > > > > > > It looks like one more place which needs a patch, right? > > > > As mentioned in the cover letter, so far I've only converted enough to get the > > defconfig *image* building (i.e. no modules). If you are compiling a different > > config or compiling the modules for defconfig, you will likely run into these > > types of issues. > > > > That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind > > enough to send me. > > > > I understand that Suse might be able to help with wider performance testing - if > > that's the reason you are trying to compile, you could send me your config and > > I'll start working on fixing up other drivers? > > You're right, performance testing is my goal. > > Heh, the openSUSE master config is cranked up to max. ;-) That would be > a lot of work, and we don't need all those options for running our test > suite. Let me disable the conflicting options instead. >[...] > I'll see if I can do something about btrfs. Then I can try to boot the > kernel... 
FWIW the kernel builds and _boots_ after applying this patch: fs/btrfs/compression.h | 2 +- fs/btrfs/defrag.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/scrub.c | 2 +- include/linux/raid/pq.h | 4 ++-- lib/raid6/algos.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -33,7 +33,7 @@ struct btrfs_bio; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) #define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) -static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE_MAX) == 0); /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1144,7 +1144,7 @@ next: } #define CLUSTER_SIZE (SZ_256K) -static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE_MAX)); /* * Defrag one contiguous target range. --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -89,7 +89,7 @@ enum { int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) +#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE_MIN) struct extent_buffer { u64 start; u32 len; --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -100,7 +100,7 @@ enum scrub_stripe_flags { SCRUB_STRIPE_FLAG_NO_REPORT, }; -#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) +#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE_MIN) /* * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. --- a/include/linux/raid/pq.h +++ b/include/linux/raid/pq.h @@ -12,7 +12,7 @@ #include <linux/blkdev.h> -extern const char raid6_empty_zero_page[PAGE_SIZE]; +extern const char raid6_empty_zero_page[PAGE_SIZE_MAX]; #else /* ! 
__KERNEL__ */ /* Used for testing in user space */ @@ -39,7 +39,7 @@ typedef uint64_t u64; #ifndef PAGE_SHIFT # define PAGE_SHIFT 12 #endif -extern const char raid6_empty_zero_page[PAGE_SIZE]; +extern const char raid6_empty_zero_page[PAGE_SIZE_MAX]; #define __init #define __exit --- a/lib/raid6/algos.c +++ b/lib/raid6/algos.c @@ -19,7 +19,7 @@ #include <linux/module.h> #include <linux/gfp.h> /* In .bss so it's zeroed */ -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); +const char raid6_empty_zero_page[PAGE_SIZE_MAX] __attribute__((aligned(256))); EXPORT_SYMBOL(raid6_empty_zero_page); #endif Petr T
On 18/10/2024 15:41, Petr Tesarik wrote: > On Fri, 18 Oct 2024 14:56:00 +0200 > Petr Tesarik <ptesarik@suse.com> wrote: > >> On Thu, 17 Oct 2024 13:32:43 +0100 >> Ryan Roberts <ryan.roberts@arm.com> wrote: >> >>> On 17/10/2024 13:27, Petr Tesarik wrote: >>>> On Mon, 14 Oct 2024 11:55:11 +0100 >>>> Ryan Roberts <ryan.roberts@arm.com> wrote: >>>> >>>>> [...] >>>>> The series is arranged as follows: >>>>> >>>>> - patch 1: Add macros required for converting non-arch code to support >>>>> boot-time page size selection >>>>> - patches 2-36: Remove PAGE_SIZE compile-time constant assumption from all >>>>> non-arch code >>>> >>>> I have just tried to recompile the openSUSE kernel with these patches >>>> applied, and I'm running into this: >>>> >>>> CC arch/arm64/hyperv/hv_core.o >>>> In file included from ../arch/arm64/hyperv/hv_core.c:14:0: >>>> ../include/linux/hyperv.h:158:5: error: variably modified ‘reserved2’ at file scope >>>> u8 reserved2[PAGE_SIZE - 68]; >>>> ^~~~~~~~~ >>>> >>>> It looks like one more place which needs a patch, right? >>> >>> As mentioned in the cover letter, so far I've only converted enough to get the >>> defconfig *image* building (i.e. no modules). If you are compiling a different >>> config or compiling the modules for defconfig, you will likely run into these >>> types of issues. >>> >>> That said, I do have some patches to fix Hyper-V, which Michael Kelley was kind >>> enough to send me. >>> >>> I understand that Suse might be able to help with wider performance testing - if >>> that's the reason you are trying to compile, you could send me your config and >>> I'll start working on fixing up other drivers? >> >> You're right, performance testing is my goal. >> >> Heh, the openSUSE master config is cranked up to max. ;-) That would be >> a lot of work, and we don't need all those options for running our test >> suite. Let me disable the conflicting options instead. >> [...] >> I'll see if I can do something about btrfs. 
Then I can try to boot the >> kernel... > > FWIW the kernel builds and _boots_ after applying this patch: Amazing - thanks for doing this! > > fs/btrfs/compression.h | 2 +- > fs/btrfs/defrag.c | 2 +- > fs/btrfs/extent_io.h | 2 +- > fs/btrfs/scrub.c | 2 +- > include/linux/raid/pq.h | 4 ++-- > lib/raid6/algos.c | 2 +- > 6 files changed, 7 insertions(+), 7 deletions(-) > > --- a/fs/btrfs/compression.h > +++ b/fs/btrfs/compression.h > @@ -33,7 +33,7 @@ struct btrfs_bio; > /* Maximum length of compressed data stored on disk */ > #define BTRFS_MAX_COMPRESSED (SZ_128K) > #define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) > -static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); > +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE_MAX) == 0); > > /* Maximum size of data before compression */ > #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) > --- a/fs/btrfs/defrag.c > +++ b/fs/btrfs/defrag.c > @@ -1144,7 +1144,7 @@ next: > } > > #define CLUSTER_SIZE (SZ_256K) > -static_assert(PAGE_ALIGNED(CLUSTER_SIZE)); > +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE_MAX)); > > /* > * Defrag one contiguous target range. > --- a/fs/btrfs/extent_io.h > +++ b/fs/btrfs/extent_io.h > @@ -89,7 +89,7 @@ enum { > int __init extent_buffer_init_cachep(void); > void __cold extent_buffer_free_cachep(void); > > -#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) > +#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE_MIN) While this works, I'm not sure if you would want to have 2 separate macros; 1 for worst-case static allocation, and 1 for dynamic allocation and iterating. I could imagine if you allocate PAGE_SIZE_MAX pages into the worst case number of slots that would increase memory. I'm not familiar with the code so don't know if this is a problem in practice. Certainly what you have done is much simpler if acceptable. 
> struct extent_buffer { > u64 start; > u32 len; > --- a/fs/btrfs/scrub.c > +++ b/fs/btrfs/scrub.c > @@ -100,7 +100,7 @@ enum scrub_stripe_flags { > SCRUB_STRIPE_FLAG_NO_REPORT, > }; > > -#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) > +#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE_MIN) Same comment. Thanks, Ryan > > /* > * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. > --- a/include/linux/raid/pq.h > +++ b/include/linux/raid/pq.h > @@ -12,7 +12,7 @@ > > #include <linux/blkdev.h> > > -extern const char raid6_empty_zero_page[PAGE_SIZE]; > +extern const char raid6_empty_zero_page[PAGE_SIZE_MAX]; > > #else /* ! __KERNEL__ */ > /* Used for testing in user space */ > @@ -39,7 +39,7 @@ typedef uint64_t u64; > #ifndef PAGE_SHIFT > # define PAGE_SHIFT 12 > #endif > -extern const char raid6_empty_zero_page[PAGE_SIZE]; > +extern const char raid6_empty_zero_page[PAGE_SIZE_MAX]; > > #define __init > #define __exit > --- a/lib/raid6/algos.c > +++ b/lib/raid6/algos.c > @@ -19,7 +19,7 @@ > #include <linux/module.h> > #include <linux/gfp.h> > /* In .bss so it's zeroed */ > -const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); > +const char raid6_empty_zero_page[PAGE_SIZE_MAX] __attribute__((aligned(256))); > EXPORT_SYMBOL(raid6_empty_zero_page); > #endif > > > Petr T
> Performance Testing > =================== > > I've run some limited performance benchmarks: > > First, a real-world benchmark that causes a lot of page table manipulation (and > therefore we would expect to see regression here if we are going to see it > anywhere); kernel compilation. It barely registers a change. Values are times, > so smaller is better. All relative to base-4k: > > | | kern | kern | user | user | real | real | > | config | mean | stdev | mean | stdev | mean | stdev | > |-------------|---------|---------|---------|---------|---------|---------| > | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | > | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | > | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | > > The Speedometer JavaScript benchmark also shows no change. Values are runs per > min, so bigger is better. All relative to base-4k: > > | config | mean | stdev | > |-------------|---------|---------| > | base-4k | 0.0% | 0.8% | > | compile-4k | 0.4% | 0.8% | > | boot-4k | 0.0% | 0.9% | > > Finally, I've run some microbenchmarks known to stress page table manipulations > (originally from David Hildenbrand). The fork test maps/allocs 1G of anon > memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon > memory then measures the cost of munmap()ing it. The fork test is known to be > extremely sensitive to any changes that cause instructions to be aligned > differently in cachelines. When using this test for other changes, I've seen > double digit regressions for the slightest thing, so 12% regression on this test > is actually fairly good. This likely represents the extreme worst case for > regressions that will be observed across other microbenchmarks (famous last > words). Values are times, so smaller is better. All relative to base-4k: > ... 
and here I am, worrying about much smaller degradation in these micro-benchmarks ;) You're right, these are pure micro-benchmarks, and while 12% does sound like "much", even stupid compiler code movement can result in such changes in the fork() micro-benchmark. So I think this is just fine, and actually "surprisingly" small. And, there is even a way to statically compile a page size and not worry about that at all. As discussed ahead of time, I consider this change very valuable. In RHEL, the biggest issue is actually the test matrix, which cannot really be reduced significantly ... but it will make shipping/packaging easier. CCing Don, who did the separate 64k RHEL flavor kernel. -- Cheers, David / dhildenb
On 16/10/2024 16:16, David Hildenbrand wrote: >> Performance Testing >> =================== >> >> I've run some limited performance benchmarks: >> >> First, a real-world benchmark that causes a lot of page table manipulation (and >> therefore we would expect to see regression here if we are going to see it >> anywhere); kernel compilation. It barely registers a change. Values are times, >> so smaller is better. All relative to base-4k: >> >> | | kern | kern | user | user | real | real | >> | config | mean | stdev | mean | stdev | mean | stdev | >> |-------------|---------|---------|---------|---------|---------|---------| >> | base-4k | 0.0% | 1.1% | 0.0% | 0.3% | 0.0% | 0.3% | >> | compile-4k | -0.2% | 1.1% | -0.2% | 0.3% | -0.1% | 0.3% | >> | boot-4k | 0.1% | 1.0% | -0.3% | 0.2% | -0.2% | 0.2% | >> >> The Speedometer JavaScript benchmark also shows no change. Values are runs per >> min, so bigger is better. All relative to base-4k: >> >> | config | mean | stdev | >> |-------------|---------|---------| >> | base-4k | 0.0% | 0.8% | >> | compile-4k | 0.4% | 0.8% | >> | boot-4k | 0.0% | 0.9% | >> >> Finally, I've run some microbenchmarks known to stress page table manipulations >> (originally from David Hildenbrand). The fork test maps/allocs 1G of anon >> memory, then measures the cost of fork(). The munmap test maps/allocs 1G of anon >> memory then measures the cost of munmap()ing it. The fork test is known to be >> extremely sensitive to any changes that cause instructions to be aligned >> differently in cachelines. When using this test for other changes, I've seen >> double digit regressions for the slightest thing, so 12% regression on this test >> is actually fairly good. This likely represents the extreme worst case for >> regressions that will be observed across other microbenchmarks (famous last >> words). Values are times, so smaller is better. All relative to base-4k: >> > > ... 
and here I am, worrying about much smaller degradation in these micro- > benchmark ;) You're right, these are pure micro-benchmarks, and while 12% does > sound like "much", even stupid compiler code movement can result in such changes > in the fork() micro benchmark. > > So I think this is just fine, and actually "surprisingly" small. And, there is > even a way to statically compile a page size and not worry about that at all. > > As discussed ahead of times, I consider this change very valuable. In RHEL, the > biggest issue is actually the test matrix, that cannot really be reduced > significantly ... but it will make shipping/packaging easier. > > CCing Don, who did the separate 64k RHEL flavor kernel. > Thanks, David! I'm planning to investigate and see if I can improve even on that 12%. I have a couple of ideas. But like you say, I don't think this should be a blocker to moving forwards.
On 10/14/24 03:55, Ryan Roberts wrote: > Hi All, > > Patch bomb incoming... This covers many subsystems, so I've included a core set > of people on the full series and additionally included maintainers on relevant > patches. I haven't included those maintainers on this cover letter since the > numbers were far too big for it to work. But I've included a link to this cover > letter on each patch, so they can hopefully find their way here. For follow up > submissions I'll break it up by subsystem, but for now thought it was important > to show the full picture. > > This RFC series implements support for boot-time page size selection within the > arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page > size has been selected at compile-time, meaning the size is baked into a given > kernel image. As use of larger-than-4K page sizes become more prevalent this > starts to present a problem for distributions. Boot-time page size selection > enables the creation of a single kernel image, which can be told which page size > to use on the kernel command line. > > Why is having an image-per-page size problematic? > ================================================= > > Many traditional distros are now supporting both 4K and 64K. And this means > managing 2 kernel packages, along with drivers for each. For some, it means > multiple installer flavours and multiple ISOs. All of this adds up to a > less-than-ideal level of complexity. Additionally, Android now supports 4K and > 16K kernels. I'm told having to explicitly manage their KABI for each kernel is > painful, and the extra flash space required for both kernel images and the > duplicated modules has been problematic. Boot-time page size selection solves > all of this. > > Additionally, in starting to think about the longer term deployment story for > D128 page tables, which Arm architecture now supports, a lot of the same > problems need to be solved, so this work sets us up nicely for that. 
> > So what's the down side? > ======================== > > Well nothing's free; Various static allocations in the kernel image must be > sized for the worst case (largest supported page size), so image size is in line > with size of 64K compile-time image. So if you're interested in 4K or 16K, there > is a slight increase to the image size. But I expect that problem goes away if > you're compressing the image - it's just some extra zeros. At boot-time, I expect > we could free the unused static storage once we know the page size - although > that would be a follow up enhancement. > > And then there is performance. Since PAGE_SIZE and friends are no longer > compile-time constants, we must look up their values and do arithmetic at > runtime instead of compile-time. My early perf testing suggests this is > imperceptible for real-world workloads, and only has small impact on > microbenchmarks - more on this below. > > Approach > ======== > > The basic idea is to rid the source of any assumptions that PAGE_SIZE and > friends are compile-time constant, but in a way that allows the compiler to > perform the same optimizations as was previously being done if they do turn out > to be compile-time constant. Where constants are required, we use limits; > PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description > of all the classes of problems to solve. > > By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to > boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 > does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, > which is an alternative to selecting a compile-time page size. > > When boot-time page size is active, the arch pgtable geometry macro definitions > resolve to something that can be configured at boot. The arm64 implementation in > this series mainly uses global, __ro_after_init variables.
I've tried using > alternatives patching, but that performs worse than loading from memory; I think > due to code size bloat. FWIW, this paragraph was not entirely clear to me until I looked at patch 57 to see that the compile-time page size selection had been retained, and could continue to be used as-is. It was somewhat implicit, but not IMHO explicit enough; not a big deal though. Great work, thanks for doing that! This makes me wonder if we could leverage any of that to have a single kernel supporting both LPAE and !LPAE on ARM 32-bit, but that still seems somewhat more difficult, largely due to the difference in the page table descriptor format (long vs. short). -- Florian
On 14/10/2024 18:32, Florian Fainelli wrote: > On 10/14/24 03:55, Ryan Roberts wrote: >> Hi All, >> >> Patch bomb incoming... This covers many subsystems, so I've included a core set >> of people on the full series and additionally included maintainers on relevant >> patches. I haven't included those maintainers on this cover letter since the >> numbers were far too big for it to work. But I've included a link to this cover >> letter on each patch, so they can hopefully find their way here. For follow up >> submissions I'll break it up by subsystem, but for now thought it was important >> to show the full picture. >> >> This RFC series implements support for boot-time page size selection within the >> arm64 kernel. arm64 supports 3 base page sizes (4K, 16K, 64K), but to date, page >> size has been selected at compile-time, meaning the size is baked into a given >> kernel image. As use of larger-than-4K page sizes become more prevalent this >> starts to present a problem for distributions. Boot-time page size selection >> enables the creation of a single kernel image, which can be told which page size >> to use on the kernel command line. >> >> Why is having an image-per-page size problematic? >> ================================================= >> >> Many traditional distros are now supporting both 4K and 64K. And this means >> managing 2 kernel packages, along with drivers for each. For some, it means >> multiple installer flavours and multiple ISOs. All of this adds up to a >> less-than-ideal level of complexity. Additionally, Android now supports 4K and >> 16K kernels. I'm told having to explicitly manage their KABI for each kernel is >> painful, and the extra flash space required for both kernel images and the >> duplicated modules has been problematic. Boot-time page size selection solves >> all of this. 
>> >> Additionally, in starting to think about the longer term deployment story for >> D128 page tables, which Arm architecture now supports, a lot of the same >> problems need to be solved, so this work sets us up nicely for that. >> >> So what's the down side? >> ======================== >> >> Well nothing's free; Various static allocations in the kernel image must be >> sized for the worst case (largest supported page size), so image size is in line >> with size of 64K compile-time image. So if you're interested in 4K or 16K, there >> is a slight increase to the image size. But I expect that problem goes away if >> you're compressing the image - it's just some extra zeros. At boot-time, I expect >> we could free the unused static storage once we know the page size - although >> that would be a follow up enhancement. >> >> And then there is performance. Since PAGE_SIZE and friends are no longer >> compile-time constants, we must look up their values and do arithmetic at >> runtime instead of compile-time. My early perf testing suggests this is >> imperceptible for real-world workloads, and only has small impact on >> microbenchmarks - more on this below. >> >> Approach >> ======== >> >> The basic idea is to rid the source of any assumptions that PAGE_SIZE and >> friends are compile-time constant, but in a way that allows the compiler to >> perform the same optimizations as was previously being done if they do turn out >> to be compile-time constant. Where constants are required, we use limits; >> PAGE_SIZE_MIN and PAGE_SIZE_MAX. See commit log in patch 1 for full description >> of all the classes of problems to solve. >> >> By default PAGE_SIZE_MIN=PAGE_SIZE_MAX=PAGE_SIZE. But an arch may opt-in to >> boot-time page size selection by defining PAGE_SIZE_MIN & PAGE_SIZE_MAX. arm64 >> does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, >> which is an alternative to selecting a compile-time page size.
>> >> When boot-time page size is active, the arch pgtable geometry macro definitions >> resolve to something that can be configured at boot. The arm64 implementation in >> this series mainly uses global, __ro_after_init variables. I've tried using >> alternatives patching, but that performs worse than loading from memory; I think >> due to code size bloat. > > FWIW, this paragraph was not entirely clear to me until I looked at patch 57 to > see that the compile time page size selection had been retained, and could > continue to be used as-is. It was somewhat implicit, but not IMHO explicit > enough, not a big deal though. I intended to make that bit clear with the above sentence "arm64 does this if the user selects the CONFIG_ARM64_BOOT_TIME_PAGE_SIZE Kconfig, which is an alternative to selecting a compile-time page size.", but appreciate there is a lot going on here. > > Great work, thanks for doing that! This makes me wonder if we could leverage any > of that to have a single kernel supporting both LPAE and !LPAE on ARM 32-bit, > but that still seems like somewhat more difficult, largely due to the difference > in the page table descriptor format (long vs. short). We will eventually have the exact same problem with FEAT_D128 on arm64. This introduces page tables with 128 bit PTEs. Ideally we would like to support both in a single image, although, we have much more thinking to do on that. But my current view is that this series solves a bunch of problems that makes it easier (PTRS_PER_Pxx and Pxx_SHIFT all become boot-time values, for example, so we can easily represent the different geometries). Yes, we still need to solve the PTE size difference (in our case 64-bit vs 128-bit). I have a couple of proposals for how to do that; the "gold-plated" approach would be to create and use a handle type to represent a PTE/PxD slot in a table. Then increments/decrements would be enforced via explicit helpers that know the size, and direct dereferencing would be impossible.
When accessing via helpers we would pass around pte_t/pxd_t values that are the larger size, then narrow them when writing back. Anshuman has a series [1] that starts to move in that direction. If you have any other ideas, it would be good to talk! [1] https://lore.kernel.org/linux-mm/20240917073117.1531207-1-anshuman.khandual@arm.com/ Thanks, Ryan
© 2016 - 2024 Red Hat, Inc.